This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 344a7ea6a TIKA-4555 -- rm tika-server-eval (#2456)
344a7ea6a is described below
commit 344a7ea6a756bdc38f9606aa01fc60376c4839ba
Author: Tim Allison <[email protected]>
AuthorDate: Tue Dec 16 11:08:38 2025 -0500
TIKA-4555 -- rm tika-server-eval (#2456)
---
tika-server/pom.xml | 1 -
tika-server/tika-server-eval/pom.xml | 59 -------
.../apache/tika/server/eval/TikaEvalResource.java | 190 ---------------------
...he.tika.server.core.resource.TikaServerResource | 15 --
.../tika/server/eval/TikaEvalResourceTest.java | 169 ------------------
5 files changed, 434 deletions(-)
diff --git a/tika-server/pom.xml b/tika-server/pom.xml
index e001dd9e1..7a76de956 100644
--- a/tika-server/pom.xml
+++ b/tika-server/pom.xml
@@ -20,7 +20,6 @@
<module>tika-server-core</module>
<module>tika-server-standard</module>
<module>tika-server-client</module>
- <module>tika-server-eval</module>
</modules>
<parent>
diff --git a/tika-server/tika-server-eval/pom.xml b/tika-server/tika-server-eval/pom.xml
deleted file mode 100644
index ddc6505c9..000000000
--- a/tika-server/tika-server-eval/pom.xml
+++ /dev/null
@@ -1,59 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
- <parent>
- <artifactId>tika-server</artifactId>
- <groupId>org.apache.tika</groupId>
- <version>4.0.0-SNAPSHOT</version>
- </parent>
- <modelVersion>4.0.0</modelVersion>
-
- <artifactId>tika-server-eval</artifactId>
- <name>Apache Tika server tika-eval handler</name>
-
- <dependencies>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-eval-core</artifactId>
- <version>${project.version}</version>
- </dependency>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-server-core</artifactId>
- <version>${project.version}</version>
- <scope>provided</scope>
- </dependency>
- </dependencies>
-
- <build>
- <plugins>
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-jar-plugin</artifactId>
- <configuration>
- <archive>
- <manifestEntries>
- <Automatic-Module-Name>org.apache.tika.server.eval</Automatic-Module-Name>
- </manifestEntries>
- </archive>
- </configuration>
- </plugin>
-
- </plugins>
- </build>
-</project>
\ No newline at end of file
diff --git a/tika-server/tika-server-eval/src/main/java/org/apache/tika/server/eval/TikaEvalResource.java b/tika-server/tika-server-eval/src/main/java/org/apache/tika/server/eval/TikaEvalResource.java
deleted file mode 100644
index 73430f292..000000000
--- a/tika-server/tika-server-eval/src/main/java/org/apache/tika/server/eval/TikaEvalResource.java
+++ /dev/null
@@ -1,190 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.server.eval;
-
-import static
org.apache.tika.eval.core.metadata.TikaEvalMetadataFilter.LANGUAGE;
-import static
org.apache.tika.eval.core.metadata.TikaEvalMetadataFilter.LANGUAGE_CONFIDENCE;
-import static
org.apache.tika.eval.core.metadata.TikaEvalMetadataFilter.NUM_ALPHA_TOKENS;
-import static
org.apache.tika.eval.core.metadata.TikaEvalMetadataFilter.NUM_TOKENS;
-import static
org.apache.tika.eval.core.metadata.TikaEvalMetadataFilter.NUM_UNIQUE_ALPHA_TOKENS;
-import static
org.apache.tika.eval.core.metadata.TikaEvalMetadataFilter.NUM_UNIQUE_TOKENS;
-import static
org.apache.tika.eval.core.metadata.TikaEvalMetadataFilter.OUT_OF_VOCABULARY;
-
-import java.io.BufferedReader;
-import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.nio.charset.StandardCharsets;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-
-import com.fasterxml.jackson.databind.JsonNode;
-import com.fasterxml.jackson.databind.ObjectMapper;
-import jakarta.ws.rs.Consumes;
-import jakarta.ws.rs.PUT;
-import jakarta.ws.rs.Path;
-import jakarta.ws.rs.Produces;
-
-import org.apache.tika.eval.core.langid.LanguageIDWrapper;
-import org.apache.tika.eval.core.metadata.TikaEvalMetadataFilter;
-import org.apache.tika.eval.core.textstats.BasicTokenCountStatsCalculator;
-import org.apache.tika.eval.core.textstats.CommonTokens;
-import org.apache.tika.eval.core.textstats.CompositeTextStatsCalculator;
-import org.apache.tika.eval.core.textstats.TextStatsCalculator;
-import org.apache.tika.eval.core.tokens.CommonTokenResult;
-import org.apache.tika.eval.core.tokens.ContrastStatistics;
-import org.apache.tika.eval.core.tokens.TokenContraster;
-import org.apache.tika.eval.core.tokens.TokenCounts;
-import org.apache.tika.language.detect.LanguageResult;
-import org.apache.tika.metadata.Property;
-import org.apache.tika.server.core.ServerStatus;
-import org.apache.tika.server.core.ServerStatusResource;
-import org.apache.tika.server.core.resource.TikaServerResource;
-import org.apache.tika.utils.StringUtils;
-
-@Path("/eval")
-public class TikaEvalResource implements TikaServerResource,
ServerStatusResource {
-
- public static final String TEXT = "text";
- public static final String TEXT_A = "textA";
- public static final String TEXT_B = "textB";
- public static final String ID = "id";
-
- public static final Property DICE = Property.externalReal(
- TikaEvalMetadataFilter.TIKA_EVAL_NS + "dice");
-
- public static final Property OVERLAP = Property.externalReal(
- TikaEvalMetadataFilter.TIKA_EVAL_NS + "overlap");
-
- private ServerStatus serverStatus;
- public static final long DEFAULT_TIMEOUT_MILLIS = 60000;
-
- static CompositeTextStatsCalculator TEXT_STATS_CALCULATOR;
-
- static {
- List<TextStatsCalculator> calcs = new ArrayList<>();
- calcs.add(new BasicTokenCountStatsCalculator());
- calcs.add(new CommonTokens());
- TEXT_STATS_CALCULATOR = new CompositeTextStatsCalculator(calcs);
- }
-
- @PUT
- @Consumes("application/json")
- @Produces("application/json")
- @Path("compare")
- public Map<String, Object> compare(InputStream is) throws Exception {
- JsonNode node = null;
- try (BufferedReader reader = new BufferedReader(
- new InputStreamReader(is, StandardCharsets.UTF_8))) {
- node = new ObjectMapper().readTree(reader);
- }
- String id = node.get(ID).asText();
- String textA = node.get(TEXT_A).asText();
- String textB = node.get(TEXT_B).asText();
- long timeoutMillis = node.has("timeoutMillis") ?
node.get("timeoutMillis").asLong() :
- DEFAULT_TIMEOUT_MILLIS;
- return compareText(id, textA, textB, timeoutMillis);
- }
-
- @PUT
- @Consumes("application/json")
- @Produces("application/json")
- @Path("profile")
- public Map<String, Object> profile(InputStream is) throws Exception {
- JsonNode node = null;
- try (BufferedReader reader = new BufferedReader(
- new InputStreamReader(is, StandardCharsets.UTF_8))) {
- node = new ObjectMapper().readTree(reader);
- }
- String id = node.get(ID).asText();
- String text = node.get(TEXT).asText();
- long timeoutMillis = node.has("timeoutMillis") ?
node.get("timeoutMillis").asLong() :
- DEFAULT_TIMEOUT_MILLIS;
- return profile(id, text, timeoutMillis);
- }
-
- private Map<String, Object> profile(String id, String text, long
timeoutMillis) {
-
- Map<String, Object> stats = new HashMap<>();
- long taskId = serverStatus.start(ServerStatus.TASK.PARSE, id,
timeoutMillis);
- try {
- profile(StringUtils.EMPTY, text, stats);
- } finally {
- serverStatus.complete(taskId);
- }
- return stats;
- }
-
-
- private Map<String, Object> compareText(String id, String textA, String
textB, long timeoutMillis) {
-
- Map<String, Object> stats = new HashMap<>();
- long taskId = serverStatus.start(ServerStatus.TASK.PARSE, id,
timeoutMillis);
- try {
- TokenCounts tokensA = profile("A", textA, stats);
- TokenCounts tokensB = profile("B", textB, stats);
- TokenContraster tokenContraster = new TokenContraster();
- ContrastStatistics contrastStatistics =
- tokenContraster.calculateContrastStatistics(tokensA,
tokensB);
- reportContrastStats(contrastStatistics, stats);
- } finally {
- serverStatus.complete(taskId);
- }
- return stats;
- }
-
- private void reportContrastStats(ContrastStatistics contrastStatistics,
- Map<String, Object> stats) {
- stats.put(DICE.getName(), contrastStatistics.getDiceCoefficient());
- stats.put(OVERLAP.getName(), contrastStatistics.getOverlap());
- //TODO, add topNMore, topNUnique
- }
-
- private TokenCounts profile(String suffix, String content, Map<String,
Object> stats) {
- Map<Class, Object> results = TEXT_STATS_CALCULATOR.calculate(content);
-
- TokenCounts tokenCounts = (TokenCounts)
results.get(BasicTokenCountStatsCalculator.class);
- stats.put(NUM_TOKENS.getName() + suffix, tokenCounts.getTotalTokens());
- stats.put(NUM_UNIQUE_TOKENS.getName() + suffix,
tokenCounts.getTotalUniqueTokens());
-
-
- //common token results
- CommonTokenResult commonTokenResult = (CommonTokenResult)
results.get(CommonTokens.class);
- stats.put(NUM_ALPHA_TOKENS.getName() + suffix,
commonTokenResult.getAlphabeticTokens());
- stats.put(NUM_UNIQUE_ALPHA_TOKENS.getName() + suffix,
commonTokenResult.getUniqueAlphabeticTokens());
- if (commonTokenResult.getAlphabeticTokens() > 0) {
- stats.put(OUT_OF_VOCABULARY.getName() + suffix,
commonTokenResult.getOOV());
- } else {
- stats.put(OUT_OF_VOCABULARY.getName() + suffix, -1.0f);
- }
-
- //languages
- List<LanguageResult> probabilities =
- (List<LanguageResult>) results.get(LanguageIDWrapper.class);
- if (probabilities.size() > 0) {
- stats.put(LANGUAGE.getName() + suffix,
probabilities.get(0).getLanguage());
- stats.put(LANGUAGE_CONFIDENCE.getName() + suffix,
probabilities.get(0).getRawScore());
- }
- return tokenCounts;
- }
-
- @Override
- public void setServerStatus(ServerStatus serverStatus) {
- this.serverStatus = serverStatus;
- }
-}
diff --git a/tika-server/tika-server-eval/src/main/resources/META-INF/services/org.apache.tika.server.core.resource.TikaServerResource b/tika-server/tika-server-eval/src/main/resources/META-INF/services/org.apache.tika.server.core.resource.TikaServerResource
deleted file mode 100644
index def7e1af5..000000000
--- a/tika-server/tika-server-eval/src/main/resources/META-INF/services/org.apache.tika.server.core.resource.TikaServerResource
+++ /dev/null
@@ -1,15 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-org.apache.tika.server.eval.TikaEvalResource
\ No newline at end of file
diff --git a/tika-server/tika-server-eval/src/test/java/org/apache/tika/server/eval/TikaEvalResourceTest.java b/tika-server/tika-server-eval/src/test/java/org/apache/tika/server/eval/TikaEvalResourceTest.java
deleted file mode 100644
index 98ce13c02..000000000
--- a/tika-server/tika-server-eval/src/test/java/org/apache/tika/server/eval/TikaEvalResourceTest.java
+++ /dev/null
@@ -1,169 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.server.eval;
-
-import static org.junit.jupiter.api.Assertions.assertEquals;
-
-import java.io.BufferedReader;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.nio.charset.StandardCharsets;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-
-import com.fasterxml.jackson.core.JsonProcessingException;
-import com.fasterxml.jackson.core.type.TypeReference;
-import com.fasterxml.jackson.databind.ObjectMapper;
-import jakarta.ws.rs.core.Response;
-import org.apache.cxf.binding.BindingFactoryManager;
-import org.apache.cxf.endpoint.Server;
-import org.apache.cxf.jaxrs.JAXRSBindingFactory;
-import org.apache.cxf.jaxrs.JAXRSServerFactoryBean;
-import org.apache.cxf.jaxrs.client.WebClient;
-import org.apache.cxf.jaxrs.lifecycle.SingletonResourceProvider;
-import org.apache.cxf.transport.common.gzip.GZIPInInterceptor;
-import org.apache.cxf.transport.common.gzip.GZIPOutInterceptor;
-import org.junit.jupiter.api.AfterAll;
-import org.junit.jupiter.api.BeforeAll;
-import org.junit.jupiter.api.Test;
-
-import org.apache.tika.eval.core.metadata.TikaEvalMetadataFilter;
-import org.apache.tika.server.core.ProduceTypeResourceComparator;
-import org.apache.tika.server.core.ServerStatus;
-import org.apache.tika.server.core.TikaServerConfig;
-import org.apache.tika.server.core.writer.JSONObjWriter;
-
-public class TikaEvalResourceTest {
-
- protected static final String END_POINT =
- "http://localhost:" + TikaServerConfig.DEFAULT_PORT;
-
- protected static final String COMPARE_END_POINT = END_POINT +
"/eval/compare";
- protected static final String PROFILE_END_POINT = END_POINT +
"/eval/profile";
- protected static Server SERVER;
-
- ObjectMapper objectMapper = new ObjectMapper();
-
- @BeforeAll
- public static void setUp() throws Exception {
- ServerStatus serverStatus = new ServerStatus();
- JAXRSServerFactoryBean sf = new JAXRSServerFactoryBean();
- //set compression interceptors
- sf.setOutInterceptors(Collections.singletonList(new
GZIPOutInterceptor()));
- sf.setInInterceptors(Collections.singletonList(new
GZIPInInterceptor()));
-
- setUpResources(sf, serverStatus);
- setUpProviders(sf);
- sf.setAddress(END_POINT + "/");
- sf.setResourceComparator(new ProduceTypeResourceComparator());
-
- BindingFactoryManager manager =
sf.getBus().getExtension(BindingFactoryManager.class);
-
- JAXRSBindingFactory factory = new JAXRSBindingFactory();
- factory.setBus(sf.getBus());
-
- manager.registerBindingFactory(JAXRSBindingFactory.JAXRS_BINDING_ID,
factory);
- SERVER = sf.create();
- }
-
- @AfterAll
- public static void tearDown() throws Exception {
- SERVER.stop();
- SERVER.destroy();
- }
-
- protected static void setUpResources(JAXRSServerFactoryBean sf,
ServerStatus serverStatus) {
- sf.setResourceClasses(TikaEvalResource.class);
- TikaEvalResource tikaEvalResource = new TikaEvalResource();
- tikaEvalResource.setServerStatus(serverStatus);
- sf.setResourceProvider(TikaEvalResource.class,
- new SingletonResourceProvider(tikaEvalResource));
- }
-
- protected static void setUpProviders(JAXRSServerFactoryBean sf) {
- List<Object> providers = new ArrayList<>();
- providers.add(new JSONObjWriter());
- sf.setProviders(providers);
- }
-
- @Test
- public void testBasicProfile() throws Exception {
- Map<String, String> request = new HashMap<>();
- request.put(TikaEvalResource.ID, "1");
- request.put(TikaEvalResource.TEXT, "the quick brown fox jumped
qwertyuiop");
- Response response = profile(request);
- Map<String, Object> results = deserialize(response);
- assertEquals(6,
(int)results.get(TikaEvalMetadataFilter.NUM_TOKENS.getName()));
- assertEquals(0.166,
(double)results.get(TikaEvalMetadataFilter.OUT_OF_VOCABULARY.getName()),
- 0.01);
- assertEquals("eng",
(String)results.get(TikaEvalMetadataFilter.LANGUAGE.getName()));
- }
-
- @Test
- public void testBasicCompare() throws Exception {
- Map<String, String> request = new HashMap<>();
- request.put(TikaEvalResource.ID, "1");
- request.put(TikaEvalResource.TEXT_A, "the quick brown fox jumped
qwertyuiop");
- request.put(TikaEvalResource.TEXT_B, "the the the fast brown dog
jumped qwertyuiop");
- Response response = compare(request);
- Map<String, Object> results = deserialize(response);
- assertEquals(6,
- (int)results.get(TikaEvalMetadataFilter.NUM_TOKENS.getName() +
"A"));
- assertEquals(0.166,
-
(double)results.get(TikaEvalMetadataFilter.OUT_OF_VOCABULARY.getName() + "A"),
- 0.01);
- assertEquals("eng",
results.get(TikaEvalMetadataFilter.LANGUAGE.getName() + "A"));
-
- assertEquals(0.666,
(double)results.get(TikaEvalResource.DICE.getName()), 0.01);
- assertEquals(0.571,
(double)results.get(TikaEvalResource.OVERLAP.getName()), 0.01);
- }
-
- private Map<String, Object> deserialize(Response response) throws
IOException {
- TypeReference<HashMap<String, Object>> typeRef
- = new TypeReference<HashMap<String, Object>>() {};
- try (BufferedReader reader =
- new BufferedReader(
- new
InputStreamReader((InputStream)response.getEntity(),
- StandardCharsets.UTF_8))) {
- return objectMapper.readValue(reader, typeRef);
- }
- }
-
- private Response profile(Map<String, String> request) throws
JsonProcessingException {
-
- String jsonRequest = objectMapper//.writerWithDefaultPrettyPrinter()
- .writeValueAsString(request);
- return WebClient.create(PROFILE_END_POINT)
- .type("application/json")
- .accept("application/json")
- .put(jsonRequest.getBytes(StandardCharsets.UTF_8));
- }
-
- private Response compare(Map<String, String> request) throws
JsonProcessingException {
-
- String jsonRequest = objectMapper//.writerWithDefaultPrettyPrinter()
- .writeValueAsString(request);
- return WebClient.create(COMPARE_END_POINT)
- .type("application/json")
- .accept("application/json")
- .put(jsonRequest.getBytes(StandardCharsets.UTF_8));
- }
-}