This is an automated email from the ASF dual-hosted git repository. ndipiazza pushed a commit to branch TIKA-4334 in repository https://gitbox.apache.org/repos/asf/tika.git
commit e46345a43000a399ecf22588c7cccc3fa32c01a3 Author: Nicholas DiPiazza <[email protected]> AuthorDate: Tue Sep 30 12:31:09 2025 -0500 TIKA-4334: replace pipes and server with newer apis - add eval and detector controllers --- tika-server/tika-server-spring/pom.xml | 13 +- .../tika/server/config/TikaConfigLoader.java | 53 +++ .../tika/server/controller/DetectorController.java | 42 ++- .../tika/server/controller/EvalController.java | 215 ++++++++++- .../org/apache/tika/server/util/TikaResource.java | 119 ------ .../main/resources/api/tika-server-openapi.yaml | 43 ++- .../apache/tika/server/IntegrationTestBase.java | 4 +- .../DetectorControllerIntegrationTest.java | 42 ++- .../controller/EvalControllerIntegrationTest.java | 403 +++++++++++++++++++++ .../src/test/resources/test-tika-config.xml | 38 ++ 10 files changed, 801 insertions(+), 171 deletions(-) diff --git a/tika-server/tika-server-spring/pom.xml b/tika-server/tika-server-spring/pom.xml index 9db1c1884..5fbd2c67b 100644 --- a/tika-server/tika-server-spring/pom.xml +++ b/tika-server/tika-server-spring/pom.xml @@ -29,7 +29,7 @@ </parent> <artifactId>tika-server-spring</artifactId> - <name>Apache Tika Spring Boot Server</name> + <name>Apache Tika Spring Web Service</name> <description>Apache Tika Server implemented with Spring Boot and OpenAPI 3.0</description> <dependencyManagement> @@ -94,6 +94,17 @@ <version>${project.parent.version}</version> </dependency> + <dependency> + <groupId>org.apache.tika</groupId> + <artifactId>tika-eval-core</artifactId> + <version>${project.parent.version}</version> + </dependency> + + <dependency> + <groupId>org.apache.tika</groupId> + <artifactId>tika-langdetect-opennlp</artifactId> + <version>${project.parent.version}</version> + </dependency> <dependency> <groupId>org.apache.tika</groupId> diff --git a/tika-server/tika-server-spring/src/main/java/org/apache/tika/server/config/TikaConfigLoader.java 
b/tika-server/tika-server-spring/src/main/java/org/apache/tika/server/config/TikaConfigLoader.java new file mode 100644 index 000000000..d388855ff --- /dev/null +++ b/tika-server/tika-server-spring/src/main/java/org/apache/tika/server/config/TikaConfigLoader.java @@ -0,0 +1,53 @@ +/* + * + * * Licensed to the Apache Software Foundation (ASF) under one or more + * * contributor license agreements. See the NOTICE file distributed with + * * this work for additional information regarding copyright ownership. + * * The ASF licenses this file to You under the Apache License, Version 2.0 + * * (the "License"); you may not use this file except in compliance with + * * the License. You may obtain a copy of the License at + * * + * * http://www.apache.org/licenses/LICENSE-2.0 + * * + * * Unless required by applicable law or agreed to in writing, software + * * distributed under the License is distributed on an "AS IS" BASIS, + * * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * * See the License for the specific language governing permissions and + * * limitations under the License. 
+ * + * + */ + +package org.apache.tika.server.config; + +import org.apache.commons.lang3.StringUtils; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.context.annotation.Bean; +import org.springframework.context.annotation.Configuration; +import org.springframework.core.env.Environment; + +import org.apache.tika.config.TikaConfig; +import org.apache.tika.exception.TikaException; + +@Configuration +public class TikaConfigLoader { + private final Environment environment; + + @Autowired + public TikaConfigLoader(Environment environment) { + this.environment = environment; + } + + @Bean + public TikaConfig tikaConfig() throws TikaException { + String tikaConfig = environment.getProperty("tika.config"); + if (StringUtils.isNotBlank(tikaConfig)) { + try { + return new TikaConfig(getClass().getClassLoader().getResourceAsStream(tikaConfig)); + } catch (Exception e) { + throw new TikaException("Could not load tika.config profile", e); + } + } + return TikaConfig.getDefaultConfig(); + } +} diff --git a/tika-server/tika-server-spring/src/main/java/org/apache/tika/server/controller/DetectorController.java b/tika-server/tika-server-spring/src/main/java/org/apache/tika/server/controller/DetectorController.java index dd1422de6..b4f62d083 100644 --- a/tika-server/tika-server-spring/src/main/java/org/apache/tika/server/controller/DetectorController.java +++ b/tika-server/tika-server-spring/src/main/java/org/apache/tika/server/controller/DetectorController.java @@ -18,23 +18,25 @@ package org.apache.tika.server.controller; import java.io.IOException; import java.io.InputStream; +import java.util.Optional; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.beans.factory.annotation.Value; import org.springframework.core.io.Resource; import org.springframework.http.HttpStatus; import org.springframework.http.ResponseEntity; import 
org.springframework.web.bind.annotation.RestController; +import org.springframework.web.context.request.NativeWebRequest; +import org.apache.tika.config.TikaConfig; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.mime.MediaType; -import org.apache.tika.parser.ParseContext; import org.apache.tika.server.api.DetectorResourceApi; import org.apache.tika.server.component.ServerStatus; -import org.apache.tika.server.util.TikaResource; /** * Controller for MIME/media type detection using the default detector. @@ -42,45 +44,51 @@ import org.apache.tika.server.util.TikaResource; */ @RestController public class DetectorController implements DetectorResourceApi { - private static final Logger LOG = LoggerFactory.getLogger(DetectorController.class); private final ServerStatus serverStatus; + private final TikaConfig tikaConfig; + + @Value("${tika.detector.taskTimeoutMillis:30000}") + private long timeoutMillis; + @Autowired - public DetectorController(ServerStatus serverStatus) { + public DetectorController(ServerStatus serverStatus, TikaConfig tikaConfig) { this.serverStatus = serverStatus; + this.tikaConfig = tikaConfig; + } + + @Override + public Optional<NativeWebRequest> getRequest() { + return DetectorResourceApi.super.getRequest(); } @Override - public ResponseEntity<String> putStream(Resource body) { + public ResponseEntity<String> putStream(Resource body, String contentDisposition) { if (body == null) { return ResponseEntity.badRequest().body("No document provided"); } - + Metadata metadata = new Metadata(); String filename = body.getFilename(); LOG.info("Detecting media type for Filename: {}", filename); - + if (filename != null) { metadata.add(TikaCoreProperties.RESOURCE_NAME_KEY, filename); } - - ParseContext parseContext = new ParseContext(); - long timeoutMillis = TikaResource.getTaskTimeout(parseContext); + long taskId = 
serverStatus.start(ServerStatus.TASK.DETECT, filename, timeoutMillis); try (InputStream is = body.getInputStream(); TikaInputStream tis = TikaInputStream.get(is)) { - - String mediaType = TikaResource - .getConfig() + + String mediaType = tikaConfig .getDetector() .detect(tis, metadata) .toString(); - + LOG.info("Detected media type: {} for file: {}", mediaType, filename); return ResponseEntity.ok(mediaType); - } catch (IOException e) { LOG.warn("Unable to detect MIME type for file. Reason: {} ({})", e.getMessage(), filename, e); return ResponseEntity.ok(MediaType.OCTET_STREAM.toString()); @@ -88,11 +96,11 @@ public class DetectorController implements DetectorResourceApi { LOG.error("OOM while detecting: ({})", filename, e); serverStatus.setStatus(ServerStatus.STATUS.ERROR); return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR) - .body("Out of memory error during detection"); + .body("Out of memory error during detection"); } catch (Throwable e) { LOG.error("Exception while detecting: ({})", filename, e); return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR) - .body("Error during MIME type detection: " + e.getMessage()); + .body("Error during MIME type detection: " + e.getMessage()); } finally { serverStatus.complete(taskId); } diff --git a/tika-server/tika-server-spring/src/main/java/org/apache/tika/server/controller/EvalController.java b/tika-server/tika-server-spring/src/main/java/org/apache/tika/server/controller/EvalController.java index 0e39c28a0..cc0ed5238 100644 --- a/tika-server/tika-server-spring/src/main/java/org/apache/tika/server/controller/EvalController.java +++ b/tika-server/tika-server-spring/src/main/java/org/apache/tika/server/controller/EvalController.java @@ -20,30 +20,231 @@ package org.apache.tika.server.controller; -import java.util.Optional; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import org.springframework.beans.factory.annotation.Autowired; +import 
org.springframework.http.HttpStatus; import org.springframework.http.ResponseEntity; -import org.springframework.web.context.request.NativeWebRequest; +import org.springframework.stereotype.Controller; +import org.apache.tika.eval.core.langid.LanguageIDWrapper; +import org.apache.tika.eval.core.metadata.TikaEvalMetadataFilter; +import org.apache.tika.eval.core.textstats.BasicTokenCountStatsCalculator; +import org.apache.tika.eval.core.textstats.CommonTokens; +import org.apache.tika.eval.core.textstats.CompositeTextStatsCalculator; +import org.apache.tika.eval.core.textstats.TextStatsCalculator; +import org.apache.tika.eval.core.tokens.CommonTokenResult; +import org.apache.tika.eval.core.tokens.ContrastStatistics; +import org.apache.tika.eval.core.tokens.TokenContraster; +import org.apache.tika.eval.core.tokens.TokenCounts; +import org.apache.tika.language.detect.LanguageResult; +import org.apache.tika.metadata.Property; import org.apache.tika.server.api.EvalResourceApi; +import org.apache.tika.server.component.ServerStatus; import org.apache.tika.server.model.PutEvalCompare200Response; import org.apache.tika.server.model.PutEvalCompareRequest; import org.apache.tika.server.model.PutEvalProfile200Response; import org.apache.tika.server.model.PutEvalProfileRequest; +import org.apache.tika.utils.StringUtils; +@Controller public class EvalController implements EvalResourceApi { - @Override - public Optional<NativeWebRequest> getRequest() { - return EvalResourceApi.super.getRequest(); + + public static final long DEFAULT_TIMEOUT_MILLIS = 60000; + + public static final Property DICE = Property.externalReal( + TikaEvalMetadataFilter.TIKA_EVAL_NS + "dice"); + + public static final Property OVERLAP = Property.externalReal( + TikaEvalMetadataFilter.TIKA_EVAL_NS + "overlap"); + + static CompositeTextStatsCalculator TEXT_STATS_CALCULATOR; + + static { + List<TextStatsCalculator> calcs = new ArrayList<>(); + calcs.add(new BasicTokenCountStatsCalculator()); + calcs.add(new 
CommonTokens()); + TEXT_STATS_CALCULATOR = new CompositeTextStatsCalculator(calcs); } + @Autowired + private ServerStatus serverStatus; + @Override public ResponseEntity<PutEvalCompare200Response> putEvalCompare(PutEvalCompareRequest putEvalCompareRequest) { - return EvalResourceApi.super.putEvalCompare(putEvalCompareRequest); + try { + String id = putEvalCompareRequest.getId(); + String textA = putEvalCompareRequest.getTextA(); + String textB = putEvalCompareRequest.getTextB(); + long timeoutMillis = putEvalCompareRequest.getTimeoutMillis() != null ? + putEvalCompareRequest.getTimeoutMillis() : DEFAULT_TIMEOUT_MILLIS; + + Map<String, Object> result = compareText(id, textA, textB, timeoutMillis); + + PutEvalCompare200Response response = new PutEvalCompare200Response(); + mapResultToCompareResponse(result, response); + + return new ResponseEntity<>(response, HttpStatus.OK); + } catch (Exception e) { + return new ResponseEntity<>(HttpStatus.INTERNAL_SERVER_ERROR); + } } @Override public ResponseEntity<PutEvalProfile200Response> putEvalProfile(PutEvalProfileRequest putEvalProfileRequest) { - return EvalResourceApi.super.putEvalProfile(putEvalProfileRequest); + try { + String id = putEvalProfileRequest.getId(); + String text = putEvalProfileRequest.getText(); + long timeoutMillis = putEvalProfileRequest.getTimeoutMillis() != null ? 
+ putEvalProfileRequest.getTimeoutMillis() : DEFAULT_TIMEOUT_MILLIS; + + Map<String, Object> result = profile(id, text, timeoutMillis); + + PutEvalProfile200Response response = new PutEvalProfile200Response(); + mapResultToProfileResponse(result, response); + + return new ResponseEntity<>(response, HttpStatus.OK); + } catch (Exception e) { + return new ResponseEntity<>(HttpStatus.INTERNAL_SERVER_ERROR); + } + } + + private Map<String, Object> compareText(String id, String textA, String textB, long timeoutMillis) { + Map<String, Object> stats = new HashMap<>(); + long taskId = serverStatus.start(ServerStatus.TASK.EVAL, id, timeoutMillis); + try { + TokenCounts tokensA = profile("A", textA, stats); + TokenCounts tokensB = profile("B", textB, stats); + TokenContraster tokenContraster = new TokenContraster(); + ContrastStatistics contrastStatistics = + tokenContraster.calculateContrastStatistics(tokensA, tokensB); + reportContrastStats(contrastStatistics, stats); + } finally { + serverStatus.complete(taskId); + } + return stats; + } + + private Map<String, Object> profile(String id, String text, long timeoutMillis) { + Map<String, Object> stats = new HashMap<>(); + long taskId = serverStatus.start(ServerStatus.TASK.EVAL, id, timeoutMillis); + try { + profile(StringUtils.EMPTY, text, stats); + } finally { + serverStatus.complete(taskId); + } + return stats; + } + + private TokenCounts profile(String suffix, String content, Map<String, Object> stats) { + Map<Class, Object> results = TEXT_STATS_CALCULATOR.calculate(content); + + TokenCounts tokenCounts = (TokenCounts) results.get(BasicTokenCountStatsCalculator.class); + stats.put("tika-eval:numTokens" + suffix, tokenCounts.getTotalTokens()); + stats.put("tika-eval:numUniqueTokens" + suffix, tokenCounts.getTotalUniqueTokens()); + + //common token results + CommonTokenResult commonTokenResult = (CommonTokenResult) results.get(CommonTokens.class); + stats.put("tika-eval:numAlphaTokens" + suffix, 
commonTokenResult.getAlphabeticTokens()); + stats.put("tika-eval:numUniqueAlphaTokens" + suffix, commonTokenResult.getUniqueAlphabeticTokens()); + if (commonTokenResult.getAlphabeticTokens() > 0) { + stats.put("tika-eval:oov" + suffix, commonTokenResult.getOOV()); + } else { + stats.put("tika-eval:oov" + suffix, -1.0f); + } + + //languages + List<LanguageResult> probabilities = + (List<LanguageResult>) results.get(LanguageIDWrapper.class); + if (probabilities.size() > 0) { + stats.put("tika-eval:lang" + suffix, probabilities.get(0).getLanguage()); + stats.put("tika-eval:langConfidence" + suffix, probabilities.get(0).getRawScore()); + } + return tokenCounts; + } + + private void reportContrastStats(ContrastStatistics contrastStatistics, + Map<String, Object> stats) { + stats.put("tika-eval:dice", contrastStatistics.getDiceCoefficient()); + stats.put("tika-eval:overlap", contrastStatistics.getOverlap()); + //TODO, add topNMore, topNUnique + } + + private void mapResultToCompareResponse(Map<String, Object> result, PutEvalCompare200Response response) { + if (result.get("tika-eval:dice") != null) { + response.setTikaEvalColonDice(((Number) result.get("tika-eval:dice")).floatValue()); + } + if (result.get("tika-eval:overlap") != null) { + response.setTikaEvalColonOverlap(((Number) result.get("tika-eval:overlap")).floatValue()); + } + if (result.get("tika-eval:numTokensA") != null) { + response.setTikaEvalColonNumTokensA(((Number) result.get("tika-eval:numTokensA")).intValue()); + } + if (result.get("tika-eval:numTokensB") != null) { + response.setTikaEvalColonNumTokensB(((Number) result.get("tika-eval:numTokensB")).intValue()); + } + if (result.get("tika-eval:numUniqueTokensA") != null) { + response.setTikaEvalColonNumUniqueTokensA(((Number) result.get("tika-eval:numUniqueTokensA")).intValue()); + } + if (result.get("tika-eval:numUniqueTokensB") != null) { + response.setTikaEvalColonNumUniqueTokensB(((Number) result.get("tika-eval:numUniqueTokensB")).intValue()); + } + 
if (result.get("tika-eval:numAlphaTokensA") != null) { + response.setTikaEvalColonNumAlphaTokensA(((Number) result.get("tika-eval:numAlphaTokensA")).intValue()); + } + if (result.get("tika-eval:numAlphaTokensB") != null) { + response.setTikaEvalColonNumAlphaTokensB(((Number) result.get("tika-eval:numAlphaTokensB")).intValue()); + } + if (result.get("tika-eval:numUniqueAlphaTokensA") != null) { + response.setTikaEvalColonNumUniqueAlphaTokensA(((Number) result.get("tika-eval:numUniqueAlphaTokensA")).intValue()); + } + if (result.get("tika-eval:numUniqueAlphaTokensB") != null) { + response.setTikaEvalColonNumUniqueAlphaTokensB(((Number) result.get("tika-eval:numUniqueAlphaTokensB")).intValue()); + } + if (result.get("tika-eval:oovA") != null) { + response.setTikaEvalColonOovA(((Number) result.get("tika-eval:oovA")).floatValue()); + } + if (result.get("tika-eval:oovB") != null) { + response.setTikaEvalColonOovB(((Number) result.get("tika-eval:oovB")).floatValue()); + } + if (result.get("tika-eval:langA") != null) { + response.setTikaEvalColonLangA((String) result.get("tika-eval:langA")); + } + if (result.get("tika-eval:langB") != null) { + response.setTikaEvalColonLangB((String) result.get("tika-eval:langB")); + } + if (result.get("tika-eval:langConfidenceA") != null) { + response.setTikaEvalColonLangConfidenceA(((Number) result.get("tika-eval:langConfidenceA")).floatValue()); + } + if (result.get("tika-eval:langConfidenceB") != null) { + response.setTikaEvalColonLangConfidenceB(((Number) result.get("tika-eval:langConfidenceB")).floatValue()); + } + } + + private void mapResultToProfileResponse(Map<String, Object> result, PutEvalProfile200Response response) { + if (result.get("tika-eval:numTokens") != null) { + response.setTikaEvalColonNumTokens(((Number) result.get("tika-eval:numTokens")).intValue()); + } + if (result.get("tika-eval:numUniqueTokens") != null) { + response.setTikaEvalColonNumUniqueTokens(((Number) result.get("tika-eval:numUniqueTokens")).intValue()); + 
} + if (result.get("tika-eval:numAlphaTokens") != null) { + response.setTikaEvalColonNumAlphaTokens(((Number) result.get("tika-eval:numAlphaTokens")).intValue()); + } + if (result.get("tika-eval:numUniqueAlphaTokens") != null) { + response.setTikaEvalColonNumUniqueAlphaTokens(((Number) result.get("tika-eval:numUniqueAlphaTokens")).intValue()); + } + if (result.get("tika-eval:oov") != null) { + response.setTikaEvalColonOov(((Number) result.get("tika-eval:oov")).floatValue()); + } + if (result.get("tika-eval:lang") != null) { + response.setTikaEvalColonLang((String) result.get("tika-eval:lang")); + } + if (result.get("tika-eval:langConfidence") != null) { + response.setTikaEvalColonLangConfidence(((Number) result.get("tika-eval:langConfidence")).floatValue()); + } } } diff --git a/tika-server/tika-server-spring/src/main/java/org/apache/tika/server/util/TikaResource.java b/tika-server/tika-server-spring/src/main/java/org/apache/tika/server/util/TikaResource.java deleted file mode 100644 index d810b14bb..000000000 --- a/tika-server/tika-server-spring/src/main/java/org/apache/tika/server/util/TikaResource.java +++ /dev/null @@ -1,119 +0,0 @@ -/* - * - * * Licensed to the Apache Software Foundation (ASF) under one or more - * * contributor license agreements. See the NOTICE file distributed with - * * this work for additional information regarding copyright ownership. - * * The ASF licenses this file to You under the Apache License, Version 2.0 - * * (the "License"); you may not use this file except in compliance with - * * the License. You may obtain a copy of the License at - * * - * * http://www.apache.org/licenses/LICENSE-2.0 - * * - * * Unless required by applicable law or agreed to in writing, software - * * distributed under the License is distributed on an "AS IS" BASIS, - * * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * * See the License for the specific language governing permissions and - * * limitations under the License. 
- * - * - */ - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.server.util; - -import org.springframework.stereotype.Component; - -import org.apache.tika.config.TikaConfig; -import org.apache.tika.detect.Detector; -import org.apache.tika.parser.ParseContext; - -/** - * Utility class providing access to Tika configuration and common operations. - * This class serves as a bridge between the Spring controllers and Tika core functionality. - */ -@Component -public class TikaResource { - - private static final long DEFAULT_TASK_TIMEOUT_MILLIS = 300000; // 5 minutes - private static TikaConfig tikaConfig; - - static { - try { - // Initialize with default Tika configuration - tikaConfig = TikaConfig.getDefaultConfig(); - } catch (Exception e) { - throw new RuntimeException("Failed to initialize TikaConfig", e); - } - } - - /** - * Get the task timeout from ParseContext, or return default if not configured. 
- * - * @param parseContext The parse context which may contain timeout configuration - * @return Timeout in milliseconds - */ - public static long getTaskTimeout(ParseContext parseContext) { - // Check if timeout is configured in parse context - if (parseContext != null) { - // Look for timeout configuration - this could be expanded based on actual Tika implementation - Object timeout = parseContext.get(Object.class); // Placeholder - actual implementation would vary - if (timeout instanceof Long) { - return (Long) timeout; - } - } - return DEFAULT_TASK_TIMEOUT_MILLIS; - } - - /** - * Get the Tika configuration instance. - * - * @return TikaConfig instance - */ - public static TikaConfig getConfig() { - return tikaConfig; - } - - /** - * Get the detector from the Tika configuration. - * - * @return Detector instance - */ - public static Detector getDetector() { - return tikaConfig.getDetector(); - } - - /** - * Set a custom TikaConfig (useful for testing or custom configurations). - * - * @param config The TikaConfig to use - */ - public static void setConfig(TikaConfig config) { - tikaConfig = config; - } - - /** - * Reset to default configuration. 
- */ - public static void resetToDefault() { - try { - tikaConfig = TikaConfig.getDefaultConfig(); - } catch (Exception e) { - throw new RuntimeException("Failed to reset to default TikaConfig", e); - } - } -} diff --git a/tika-server/tika-server-spring/src/main/resources/api/tika-server-openapi.yaml b/tika-server/tika-server-spring/src/main/resources/api/tika-server-openapi.yaml index 6a2d738b4..eee596a91 100644 --- a/tika-server/tika-server-spring/src/main/resources/api/tika-server-openapi.yaml +++ b/tika-server/tika-server-spring/src/main/resources/api/tika-server-openapi.yaml @@ -177,11 +177,20 @@ paths: operationId: put_stream requestBody: content: - '*/*': + application/octet-stream: schema: type: string format: binary required: true + parameters: + - in: header + name: Content-Disposition + description: | + Optional header to provide a filename hint to Tika for more accurate detection. + Example: `attachment; filename="document.pdf"` + schema: + type: string + required: false responses: '200': content: @@ -1284,12 +1293,34 @@ paths: "tika-eval:numAlphaTokensB": type: integer description: Number of alphabetic tokens in text B - "tika-eval:languageA": + "tika-eval:numUniqueAlphaTokensA": + type: integer + description: Number of unique alphabetic tokens in text A + "tika-eval:numUniqueAlphaTokensB": + type: integer + description: Number of unique alphabetic tokens in text B + "tika-eval:oovA": + type: number + format: float + description: Out-of-vocabulary ratio for text A + "tika-eval:oovB": + type: number + format: float + description: Out-of-vocabulary ratio for text B + "tika-eval:langA": type: string description: Detected language for text A - "tika-eval:languageB": + "tika-eval:langB": type: string description: Detected language for text B + "tika-eval:langConfidenceA": + type: number + format: float + description: Language detection confidence for text A + "tika-eval:langConfidenceB": + type: number + format: float + description: Language detection 
confidence for text B description: If successful, this operation returns HTTP status code 200 with comparison statistics and similarity metrics. '500': description: An error occurred processing the call. @@ -1342,15 +1373,15 @@ paths: "tika-eval:numUniqueAlphaTokens": type: integer description: Number of unique alphabetic tokens - "tika-eval:outOfVocabulary": + "tika-eval:oov": type: number format: float description: Out-of-vocabulary ratio (-1.0 if no alphabetic tokens) - "tika-eval:language": + "tika-eval:lang": type: string description: Detected language code example: "en" - "tika-eval:languageConfidence": + "tika-eval:langConfidence": type: number format: float description: Language detection confidence score diff --git a/tika-server/tika-server-spring/src/test/java/org/apache/tika/server/IntegrationTestBase.java b/tika-server/tika-server-spring/src/test/java/org/apache/tika/server/IntegrationTestBase.java index e8ebfc607..c030e97eb 100644 --- a/tika-server/tika-server-spring/src/test/java/org/apache/tika/server/IntegrationTestBase.java +++ b/tika-server/tika-server-spring/src/test/java/org/apache/tika/server/IntegrationTestBase.java @@ -29,7 +29,7 @@ import org.springframework.web.context.WebApplicationContext; * and setup for Spring Boot tests with a random port. 
*/ @SpringBootTest(webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT) -@TestPropertySource(properties = {"tika.config.path=classpath:test-tika-config.xml"}) +@TestPropertySource(properties = {"tika.config=test-tika-config.xml"}) public abstract class IntegrationTestBase { @LocalServerPort @@ -45,4 +45,4 @@ public abstract class IntegrationTestBase { protected String getBaseUrl() { return "http://localhost:" + port; } -} \ No newline at end of file +} diff --git a/tika-server/tika-server-spring/src/test/java/org/apache/tika/server/controller/DetectorControllerIntegrationTest.java b/tika-server/tika-server-spring/src/test/java/org/apache/tika/server/controller/DetectorControllerIntegrationTest.java index 3a2cca8e9..015c732e9 100644 --- a/tika-server/tika-server-spring/src/test/java/org/apache/tika/server/controller/DetectorControllerIntegrationTest.java +++ b/tika-server/tika-server-spring/src/test/java/org/apache/tika/server/controller/DetectorControllerIntegrationTest.java @@ -27,8 +27,6 @@ import java.nio.file.Paths; import org.junit.jupiter.api.Test; import org.springframework.http.MediaType; -import org.springframework.mock.web.MockMultipartFile; -import org.springframework.test.web.servlet.request.MockMvcRequestBuilders; import org.apache.tika.server.IntegrationTestBase; @@ -55,7 +53,8 @@ public class DetectorControllerIntegrationTest extends IntegrationTestBase { mockMvc.perform(put("/detect/stream") .contentType(MediaType.APPLICATION_OCTET_STREAM) - .content(jsonContent)) + .content(jsonContent) + .header("Content-Disposition", "attachment; filename=\"test.json\"")) .andExpect(status().isOk()) .andExpect(content().string("application/json")); } @@ -101,11 +100,12 @@ public class DetectorControllerIntegrationTest extends IntegrationTestBase { public void testDetectEmptyContent() throws Exception { byte[] emptyContent = new byte[0]; + // Empty content is treated as missing request body by Spring, so expect 400 mockMvc.perform(put("/detect/stream") 
.contentType(MediaType.APPLICATION_OCTET_STREAM) - .content(emptyContent)) - .andExpect(status().isOk()) - .andExpect(content().string("application/octet-stream")); + .content(emptyContent) + .header("Content-Disposition", "attachment; filename=\"empty.txt\"")) + .andExpect(status().isBadRequest()); } @Test @@ -126,9 +126,10 @@ public class DetectorControllerIntegrationTest extends IntegrationTestBase { byte[] textContent = "This is a test file.".getBytes(StandardCharsets.UTF_8); // Test that filename hints help with detection - mockMvc.perform(MockMvcRequestBuilders.multipart("/detect/stream") - .file(new MockMultipartFile("file", "test.txt", - MediaType.TEXT_PLAIN_VALUE, textContent))) + mockMvc.perform(put("/detect/stream") + .contentType(MediaType.APPLICATION_OCTET_STREAM) + .content(textContent) + .header("Content-Disposition", "attachment; filename=\"test.txt\"")) .andExpect(status().isOk()) .andExpect(content().string("text/plain")); } @@ -139,9 +140,10 @@ public class DetectorControllerIntegrationTest extends IntegrationTestBase { mockMvc.perform(put("/detect/stream") .contentType(MediaType.APPLICATION_OCTET_STREAM) - .content(jsContent)) + .content(jsContent) + .header("Content-Disposition", "attachment; filename=\"test.js\"")) .andExpect(status().isOk()) - .andExpect(content().string("application/javascript")); + .andExpect(content().string("text/javascript")); } @Test @@ -150,7 +152,8 @@ public class DetectorControllerIntegrationTest extends IntegrationTestBase { mockMvc.perform(put("/detect/stream") .contentType(MediaType.APPLICATION_OCTET_STREAM) - .content(cssContent)) + .content(cssContent) + .header("Content-Disposition", "attachment; filename=\"style.css\"")) .andExpect(status().isOk()) .andExpect(content().string("text/css")); } @@ -175,8 +178,7 @@ public class DetectorControllerIntegrationTest extends IntegrationTestBase { public void testDetectNullContent() throws Exception { // Test with no content - should return bad request 
mockMvc.perform(put("/detect/stream")) - .andExpect(status().isBadRequest()) - .andExpect(content().string("No document provided")); + .andExpect(status().isBadRequest()); } @Test @@ -238,10 +240,10 @@ public class DetectorControllerIntegrationTest extends IntegrationTestBase { mockMvc.perform(put("/detect/stream") .contentType(MediaType.APPLICATION_OCTET_STREAM) - .content(markdownContent)) + .content(markdownContent) + .header("Content-Disposition", "attachment; filename=\"test.md\"")) .andExpect(status().isOk()) - // Markdown might be detected as text/plain since it's text-based - .andExpect(content().string("text/plain")); + .andExpect(content().string("text/x-web-markdown")); } @Test @@ -251,7 +253,8 @@ public class DetectorControllerIntegrationTest extends IntegrationTestBase { mockMvc.perform(put("/detect/stream") .contentType(MediaType.APPLICATION_OCTET_STREAM) - .content(csvContent)) + .content(csvContent) + .header("Content-Disposition", "attachment; filename=\"test.csv\"")) .andExpect(status().isOk()) .andExpect(content().string("text/csv")); } @@ -311,7 +314,8 @@ public class DetectorControllerIntegrationTest extends IntegrationTestBase { mockMvc.perform(put("/detect/stream") .contentType(MediaType.APPLICATION_OCTET_STREAM) - .content(jsonContent)) + .content(jsonContent) + .header("Content-Disposition", "attachment; filename=\"test.json\"")) .andExpect(status().isOk()) .andExpect(content().string("application/json")); } diff --git a/tika-server/tika-server-spring/src/test/java/org/apache/tika/server/controller/EvalControllerIntegrationTest.java b/tika-server/tika-server-spring/src/test/java/org/apache/tika/server/controller/EvalControllerIntegrationTest.java new file mode 100644 index 000000000..c0720a6f0 --- /dev/null +++ b/tika-server/tika-server-spring/src/test/java/org/apache/tika/server/controller/EvalControllerIntegrationTest.java @@ -0,0 +1,403 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license 
agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.server.controller; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.springframework.test.web.servlet.request.MockMvcRequestBuilders.put; +import static org.springframework.test.web.servlet.result.MockMvcResultMatchers.status; + +import java.nio.charset.StandardCharsets; +import java.util.HashMap; +import java.util.Map; + +import com.fasterxml.jackson.core.type.TypeReference; +import com.fasterxml.jackson.databind.ObjectMapper; +import org.junit.jupiter.api.Test; +import org.springframework.http.MediaType; +import org.springframework.test.web.servlet.MvcResult; + +import org.apache.tika.server.IntegrationTestBase; + +/** + * Integration tests for EvalController. + * Tests text profiling and comparison endpoints using TikaEval framework. 
+ */ +public class EvalControllerIntegrationTest extends IntegrationTestBase { + + private final ObjectMapper objectMapper = new ObjectMapper(); + + @Test + public void testBasicProfile() throws Exception { + Map<String, String> request = new HashMap<>(); + request.put("id", "1"); + request.put("text", "the quick brown fox jumped qwertyuiop"); + + String jsonRequest = objectMapper.writeValueAsString(request); + + MvcResult result = mockMvc.perform(put("/eval/profile") + .contentType(MediaType.APPLICATION_JSON) + .content(jsonRequest.getBytes(StandardCharsets.UTF_8))) + .andExpect(status().isOk()) + .andReturn(); + + String responseBody = result.getResponse().getContentAsString(); + Map<String, Object> results = objectMapper.readValue(responseBody, + new TypeReference<Map<String, Object>>() {}); + + // Verify token count - based on original test expectations + Integer numTokens = (Integer) results.get("tika-eval:numTokens"); + assertEquals(6, numTokens.intValue()); + + Object oovObj = results.get("tika-eval:oov"); + Double oov = ((Number) oovObj).doubleValue(); + assertEquals(0.166, oov, 0.01); + + // Verify language detection + String language = (String) results.get("tika-eval:lang"); + assertNotNull(language); + } + + @Test + public void testBasicCompare() throws Exception { + Map<String, String> request = new HashMap<>(); + request.put("id", "1"); + request.put("textA", "the quick brown fox jumped qwertyuiop"); + request.put("textB", "the the the fast brown dog jumped qwertyuiop"); + + String jsonRequest = objectMapper.writeValueAsString(request); + + MvcResult result = mockMvc.perform(put("/eval/compare") + .contentType(MediaType.APPLICATION_JSON) + .content(jsonRequest.getBytes(StandardCharsets.UTF_8))) + .andExpect(status().isOk()) + .andReturn(); + + String responseBody = result.getResponse().getContentAsString(); + Map<String, Object> results = objectMapper.readValue(responseBody, + new TypeReference<Map<String, Object>>() {}); + + // Verify text A 
statistics - based on original test expectations + Integer numTokensA = (Integer) results.get("tika-eval:numTokensA"); + assertEquals(6, numTokensA.intValue()); + + Object oovAObj = results.get("tika-eval:oovA"); + Double oovA = ((Number) oovAObj).doubleValue(); + assertEquals(0.166, oovA, 0.01); + + String languageA = (String) results.get("tika-eval:langA"); + assertNotNull(languageA); + + // Verify similarity metrics - based on original test expectations + Object diceObj = results.get("tika-eval:dice"); + Double dice = ((Number) diceObj).doubleValue(); + assertEquals(0.666, dice, 0.01); + + Object overlapObj = results.get("tika-eval:overlap"); + Double overlap = ((Number) overlapObj).doubleValue(); + assertEquals(0.571, overlap, 0.01); + } + + @Test + public void testProfileWithTimeout() throws Exception { + Map<String, Object> request = new HashMap<>(); + request.put("id", "timeout-test"); + request.put("text", "short text for testing timeout functionality"); + request.put("timeoutMillis", 30000); + + String jsonRequest = objectMapper.writeValueAsString(request); + + MvcResult result = mockMvc.perform(put("/eval/profile") + .contentType(MediaType.APPLICATION_JSON) + .content(jsonRequest.getBytes(StandardCharsets.UTF_8))) + .andExpect(status().isOk()) + .andReturn(); + + String responseBody = result.getResponse().getContentAsString(); + Map<String, Object> results = objectMapper.readValue(responseBody, + new TypeReference<Map<String, Object>>() {}); + + // Verify basic statistics for the text + assertNotNull(results.get("tika-eval:numTokens")); + assertNotNull(results.get("tika-eval:numUniqueTokens")); + } + + @Test + public void testCompareWithTimeout() throws Exception { + Map<String, Object> request = new HashMap<>(); + request.put("id", "compare-timeout-test"); + request.put("textA", "hello world"); + request.put("textB", "hello universe"); + request.put("timeoutMillis", 30000); + + String jsonRequest = objectMapper.writeValueAsString(request); + + MvcResult 
result = mockMvc.perform(put("/eval/compare") + .contentType(MediaType.APPLICATION_JSON) + .content(jsonRequest.getBytes(StandardCharsets.UTF_8))) + .andExpect(status().isOk()) + .andReturn(); + + String responseBody = result.getResponse().getContentAsString(); + Map<String, Object> results = objectMapper.readValue(responseBody, + new TypeReference<Map<String, Object>>() {}); + + // Verify both texts have been processed + assertNotNull(results.get("tika-eval:numTokensA")); + assertNotNull(results.get("tika-eval:numTokensB")); + + // Verify similarity metrics are present + assertNotNull(results.get("tika-eval:dice")); + assertNotNull(results.get("tika-eval:overlap")); + } + + @Test + public void testProfileEmptyText() throws Exception { + Map<String, String> request = new HashMap<>(); + request.put("id", "empty-test"); + request.put("text", ""); + + String jsonRequest = objectMapper.writeValueAsString(request); + + MvcResult result = mockMvc.perform(put("/eval/profile") + .contentType(MediaType.APPLICATION_JSON) + .content(jsonRequest.getBytes(StandardCharsets.UTF_8))) + .andExpect(status().isOk()) + .andReturn(); + + String responseBody = result.getResponse().getContentAsString(); + Map<String, Object> results = objectMapper.readValue(responseBody, + new TypeReference<Map<String, Object>>() {}); + + // Empty text should have zero tokens + assertEquals(0, (Integer) results.get("tika-eval:numTokens")); + assertEquals(0, (Integer) results.get("tika-eval:numUniqueTokens")); + assertEquals(0, (Integer) results.get("tika-eval:numAlphaTokens")); + } + + @Test + public void testProfileLongText() throws Exception { + // Create a longer text sample for testing + StringBuilder longText = new StringBuilder(); + for (int i = 0; i < 100; i++) { + longText.append("This is sentence number ").append(i).append(". 
"); + } + + Map<String, String> request = new HashMap<>(); + request.put("id", "long-text-test"); + request.put("text", longText.toString()); + + String jsonRequest = objectMapper.writeValueAsString(request); + + MvcResult result = mockMvc.perform(put("/eval/profile") + .contentType(MediaType.APPLICATION_JSON) + .content(jsonRequest.getBytes(StandardCharsets.UTF_8))) + .andExpect(status().isOk()) + .andReturn(); + + String responseBody = result.getResponse().getContentAsString(); + Map<String, Object> results = objectMapper.readValue(responseBody, + new TypeReference<Map<String, Object>>() {}); + + // Verify we get reasonable statistics for long text + Integer numTokens = (Integer) results.get("tika-eval:numTokens"); + assertNotNull(numTokens); + // Should have many tokens for this long text + assert(numTokens > 100); + + assertNotNull(results.get("tika-eval:numUniqueTokens")); + } + + @Test + public void testCompareIdenticalTexts() throws Exception { + Map<String, String> request = new HashMap<>(); + request.put("id", "identical-test"); + request.put("textA", "the quick brown fox"); + request.put("textB", "the quick brown fox"); + + String jsonRequest = objectMapper.writeValueAsString(request); + + MvcResult result = mockMvc.perform(put("/eval/compare") + .contentType(MediaType.APPLICATION_JSON) + .content(jsonRequest.getBytes(StandardCharsets.UTF_8))) + .andExpect(status().isOk()) + .andReturn(); + + String responseBody = result.getResponse().getContentAsString(); + Map<String, Object> results = objectMapper.readValue(responseBody, + new TypeReference<Map<String, Object>>() {}); + + // Identical texts should have high similarity scores + Double dice = (Double) results.get("tika-eval:dice"); + assertNotNull(dice); + // Dice coefficient should be close to 1.0 for identical texts + assert(dice > 0.9); + + // Token counts should be identical + assertEquals(results.get("tika-eval:numTokensA"), results.get("tika-eval:numTokensB")); + } + + @Test + public void 
testCompareCompletelyDifferentTexts() throws Exception { + Map<String, String> request = new HashMap<>(); + request.put("id", "different-test"); + request.put("textA", "apple banana cherry"); + request.put("textB", "dog elephant frog"); + + String jsonRequest = objectMapper.writeValueAsString(request); + + MvcResult result = mockMvc.perform(put("/eval/compare") + .contentType(MediaType.APPLICATION_JSON) + .content(jsonRequest.getBytes(StandardCharsets.UTF_8))) + .andExpect(status().isOk()) + .andReturn(); + + String responseBody = result.getResponse().getContentAsString(); + Map<String, Object> results = objectMapper.readValue(responseBody, + new TypeReference<Map<String, Object>>() {}); + + // Completely different texts should have low similarity scores + Double dice = (Double) results.get("tika-eval:dice"); + assertNotNull(dice); + // Dice coefficient should be close to 0.0 for completely different texts + assert(dice < 0.1); + + Double overlap = (Double) results.get("tika-eval:overlap"); + assertNotNull(overlap); + assert(overlap < 0.1); + } + + @Test + public void testInvalidJsonRequest() throws Exception { + String invalidJson = "{invalid json}"; + + mockMvc.perform(put("/eval/profile") + .contentType(MediaType.APPLICATION_JSON) + .content(invalidJson.getBytes(StandardCharsets.UTF_8))) + .andExpect(status().isBadRequest()); + } + + @Test + public void testMissingRequiredFields() throws Exception { + Map<String, String> request = new HashMap<>(); + request.put("id", "missing-text"); + // Missing "text" field + + String jsonRequest = objectMapper.writeValueAsString(request); + + mockMvc.perform(put("/eval/profile") + .contentType(MediaType.APPLICATION_JSON) + .content(jsonRequest.getBytes(StandardCharsets.UTF_8))) + .andExpect(status().isBadRequest()); + } + + @Test + public void testMissingRequiredFieldsCompare() throws Exception { + Map<String, String> request = new HashMap<>(); + request.put("id", "missing-text"); + request.put("textA", "some text"); + // 
Missing "textB" field + + String jsonRequest = objectMapper.writeValueAsString(request); + + mockMvc.perform(put("/eval/compare") + .contentType(MediaType.APPLICATION_JSON) + .content(jsonRequest.getBytes(StandardCharsets.UTF_8))) + .andExpect(status().isBadRequest()); + } + + @Test + public void testMultipleProfileRequests() throws Exception { + // Test that the server can handle multiple consecutive profile requests (sent one after another, not in parallel) + Map<String, String> request1 = new HashMap<>(); + request1.put("id", "multi-test-1"); + request1.put("text", "First test text for profiling."); + + Map<String, String> request2 = new HashMap<>(); + request2.put("id", "multi-test-2"); + request2.put("text", "Second test text for profiling analysis."); + + Map<String, String> request3 = new HashMap<>(); + request3.put("id", "multi-test-3"); + request3.put("text", "Third test text with different content for evaluation."); + + String jsonRequest1 = objectMapper.writeValueAsString(request1); + String jsonRequest2 = objectMapper.writeValueAsString(request2); + String jsonRequest3 = objectMapper.writeValueAsString(request3); + + // Execute multiple requests to test server status tracking + mockMvc.perform(put("/eval/profile") + .contentType(MediaType.APPLICATION_JSON) + .content(jsonRequest1.getBytes(StandardCharsets.UTF_8))) + .andExpect(status().isOk()); + + mockMvc.perform(put("/eval/profile") + .contentType(MediaType.APPLICATION_JSON) + .content(jsonRequest2.getBytes(StandardCharsets.UTF_8))) + .andExpect(status().isOk()); + + mockMvc.perform(put("/eval/profile") + .contentType(MediaType.APPLICATION_JSON) + .content(jsonRequest3.getBytes(StandardCharsets.UTF_8))) + .andExpect(status().isOk()); + } + + @Test + public void testSpecialCharactersInText() throws Exception { + Map<String, String> request = new HashMap<>(); + request.put("id", "special-chars-test"); + request.put("text", "Hello, world! 
This text contains special characters: @#$%^&*()_+{}|:<>?"); + + String jsonRequest = objectMapper.writeValueAsString(request); + + MvcResult result = mockMvc.perform(put("/eval/profile") + .contentType(MediaType.APPLICATION_JSON) + .content(jsonRequest.getBytes(StandardCharsets.UTF_8))) + .andExpect(status().isOk()) + .andReturn(); + + String responseBody = result.getResponse().getContentAsString(); + Map<String, Object> results = objectMapper.readValue(responseBody, + new TypeReference<Map<String, Object>>() {}); + + // Should handle special characters gracefully + assertNotNull(results.get("tika-eval:numTokens")); + assertNotNull(results.get("tika-eval:numAlphaTokens")); + } + + @Test + public void testUnicodeText() throws Exception { + Map<String, String> request = new HashMap<>(); + request.put("id", "unicode-test"); + request.put("text", "Héllo wørld! This is tëst tëxt with ūnïcōdē characters: 你好世界 🌍"); + + String jsonRequest = objectMapper.writeValueAsString(request); + + MvcResult result = mockMvc.perform(put("/eval/profile") + .contentType(MediaType.APPLICATION_JSON) + .content(jsonRequest.getBytes(StandardCharsets.UTF_8))) + .andExpect(status().isOk()) + .andReturn(); + + String responseBody = result.getResponse().getContentAsString(); + Map<String, Object> results = objectMapper.readValue(responseBody, + new TypeReference<Map<String, Object>>() {}); + + // Should handle Unicode characters gracefully + assertNotNull(results.get("tika-eval:numTokens")); + } +} diff --git a/tika-server/tika-server-spring/src/test/resources/test-tika-config.xml b/tika-server/tika-server-spring/src/test/resources/test-tika-config.xml new file mode 100644 index 000000000..2e2597ad4 --- /dev/null +++ b/tika-server/tika-server-spring/src/test/resources/test-tika-config.xml @@ -0,0 +1,38 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. 
See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<properties> + <!-- Basic configuration for integration tests --> + <parsers> + <parser class="org.apache.tika.parser.DefaultParser"/> + </parsers> + + <detectors> + <detector class="org.apache.tika.detect.DefaultDetector"/> + <detector class="org.apache.tika.mime.MimeTypes"/> + </detectors> + + <!-- Server-specific configuration --> + <service-loader> + <dynamic>true</dynamic> + </service-loader> + + <!-- Metadata configuration --> + <metadataFilter class="org.apache.tika.metadata.filter.MetadataFilter"> + <!-- Allow all metadata for tests --> + </metadataFilter> +</properties>
