This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new b822e8eb3 TIKA-4574 -- rm SentimentAnalysisParser (#2455)
b822e8eb3 is described below
commit b822e8eb372ec61ce2a1d96a28166a120d7ee68d
Author: Tim Allison <[email protected]>
AuthorDate: Tue Dec 16 11:08:55 2025 -0500
TIKA-4574 -- rm SentimentAnalysisParser (#2455)
---
.../tika-parsers-ml/tika-parser-nlp-module/pom.xml | 49 --------
.../parser/sentiment/SentimentAnalysisParser.java | 137 ---------------------
.../sentiment/SentimentAnalysisParserTest.java | 84 -------------
.../configs/tika-config-sentiment-opennlp-cat.json | 10 --
.../configs/tika-config-sentiment-opennlp.json | 10 --
5 files changed, 290 deletions(-)
diff --git a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/pom.xml b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/pom.xml
index 997162225..1424f537f 100644
--- a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/pom.xml
+++ b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/pom.xml
@@ -101,55 +101,6 @@
<groupId>com.google.code.gson</groupId>
<artifactId>gson</artifactId>
</dependency>
- <!-- sentiment parser -->
- <dependency>
- <groupId>edu.usc.ir</groupId>
- <artifactId>sentiment-analysis-parser</artifactId>
- <version>0.1</version>
- <exclusions>
- <exclusion>
- <groupId>org.apache.tika</groupId>
- <artifactId>tika-parsers</artifactId>
- </exclusion>
- <exclusion>
- <groupId>org.apache.tika</groupId>
- <artifactId>tika-translate</artifactId>
- </exclusion>
- <exclusion>
- <groupId>org.apache.tika</groupId>
- <artifactId>tika-langdetect</artifactId>
- </exclusion>
- <exclusion>
- <groupId>org.apache.tika</groupId>
- <artifactId>tika-core</artifactId>
- </exclusion>
- <exclusion>
- <groupId>org.apache.tika</groupId>
- <artifactId>tika-serialization</artifactId>
- </exclusion>
- <exclusion>
- <groupId>org.apache.tika</groupId>
- <artifactId>tika-batch</artifactId>
- </exclusion>
- <exclusion>
- <groupId>org.slf4j</groupId>
- <artifactId>slf4j-log4j12</artifactId>
- </exclusion>
- <exclusion>
- <groupId>org.slf4j</groupId>
- <artifactId>jul-to-slf4j</artifactId>
- </exclusion>
- <exclusion>
- <groupId>org.slf4j</groupId>
- <artifactId>jcl-over-slf4j</artifactId>
- </exclusion>
- <exclusion>
- <groupId>log4j</groupId>
- <artifactId>log4j</artifactId>
- </exclusion>
- </exclusions>
- </dependency>
-
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>log4j-over-slf4j</artifactId>
diff --git a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/sentiment/SentimentAnalysisParser.java b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/sentiment/SentimentAnalysisParser.java
deleted file mode 100644
index 16a4b81ab..000000000
--- a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/sentiment/SentimentAnalysisParser.java
+++ /dev/null
@@ -1,137 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.sentiment;
-
-import java.io.File;
-import java.io.IOException;
-import java.net.URL;
-import java.util.Collections;
-import java.util.Set;
-
-import opennlp.tools.sentiment.SentimentME;
-import opennlp.tools.sentiment.SentimentModel;
-import org.apache.commons.io.IOUtils;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-import org.apache.tika.config.Initializable;
-import org.apache.tika.config.TikaComponent;
-import org.apache.tika.exception.TikaConfigException;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-
-/**
- * This parser classifies documents based on the sentiment of document.
- * The classifier is powered by Apache OpenNLP's Maximum Entropy Classifier
- */
-@TikaComponent(spi = false)
-public class SentimentAnalysisParser implements Parser, Initializable {
-
- public static final String DEF_MODEL =
- "https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin";
- private static final Set<MediaType> SUPPORTED_TYPES =
- Collections.singleton(MediaType.application("sentiment"));
- private static final Logger LOG = LoggerFactory.getLogger(SentimentAnalysisParser.class);
- private SentimentME classifier;
-
- /**
- * Path to model path. Default is {@value DEF_MODEL}
- * <p>
- * <br/>
- * The path could be one of the following:
- * <ul>
- * <li>a HTTP or HTTPS URL (Not recommended for production use since no caching is
- * implemented) </li>
- * <li>an absolute or relative path on local file system (recommended for production use in
- * standalone mode)</li>
- * <li>a relative path known to class loader (Especially useful in distributed environments,
- * recommended for advanced users</li>
- * </ul>
- * Note: on conflict: the model from local file system gets the priority
- * over classpath
- */
- private String modelPath = DEF_MODEL;
-
- @Override
- public void initialize() throws TikaConfigException {
- LOG.debug("Initializing...");
- if (modelPath == null) {
- throw new TikaConfigException("Parameter 'modelPath' is required but it is not set");
- }
- try {
- URL resolvedUrl = null;
- if (modelPath.startsWith("http://") || modelPath.startsWith("https://")) {
- resolvedUrl = new URL(modelPath);
- } else {
- resolvedUrl = getClass().getClassLoader().getResource(modelPath);
- File file = new File(modelPath);
- if (file.exists()) { // file on filesystem gets higher priority
- resolvedUrl = file.toURI().toURL();
- }
- }
- if (resolvedUrl == null) {
- throw new TikaConfigException("Model doesn't exists :" + modelPath);
- }
- LOG.info("Sentiment Model is at {}", resolvedUrl);
- long st = System.currentTimeMillis();
- SentimentModel model = new SentimentModel(resolvedUrl);
- long time = System.currentTimeMillis() - st;
- LOG.debug("time taken to load model {}", time);
- classifier = new SentimentME(model);
- } catch (Exception e) {
- LOG.warn("Failed to load sentiment model from {}" + modelPath);
- throw new TikaConfigException(e.getMessage(), e);
- }
- }
-
- /**
- * Returns the types supported
- *
- * @param context the parse context
- * @return the set of types supported
- */
- @Override
- public Set<MediaType> getSupportedTypes(ParseContext context) {
- return SUPPORTED_TYPES;
- }
-
- /**
- * Performs the parse
- *
- * @param stream the input
- * @param handler the content handler
- * @param metadata the metadata passed
- * @param context the context for the parser
- */
- @Override
- public void parse(TikaInputStream tis, ContentHandler handler, Metadata metadata,
- ParseContext context) throws IOException, SAXException, TikaException {
- if (classifier == null) {
- LOG.warn(getClass().getSimpleName() + " is not configured properly.");
- return;
- }
- String inputString = IOUtils.toString(tis, "UTF-8");
- String sentiment = classifier.predict(inputString);
- metadata.add("Sentiment", sentiment);
- }
-}
diff --git a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/test/java/org/apache/tika/parser/sentiment/SentimentAnalysisParserTest.java b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/test/java/org/apache/tika/parser/sentiment/SentimentAnalysisParserTest.java
deleted file mode 100644
index 8380bcc81..000000000
--- a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/test/java/org/apache/tika/parser/sentiment/SentimentAnalysisParserTest.java
+++ /dev/null
@@ -1,84 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.sentiment;
-
-import static org.junit.jupiter.api.Assertions.assertEquals;
-import static org.junit.jupiter.api.Assertions.assertNotNull;
-
-import java.io.IOException;
-import java.net.URISyntaxException;
-import java.nio.charset.StandardCharsets;
-
-import org.junit.jupiter.api.Test;
-
-import org.apache.tika.TikaTest;
-import org.apache.tika.config.loader.TikaLoader;
-import org.apache.tika.exception.TikaConfigException;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.Parser;
-
-/**
- * Test case for {@link SentimentAnalysisParser}
- */
-public class SentimentAnalysisParserTest extends TikaTest {
-
- @Test
- public void endToEndTest() throws Exception {
- Parser parser = getParser("tika-config-sentiment-opennlp.json");
- if (parser == null) {
- return;
- }
-
- String text = "What a wonderful thought it is that" +
- " some of the best days of our lives haven't happened yet.";
- Metadata md = getXML(TikaInputStream.get(text.getBytes(StandardCharsets.UTF_8)),
- parser, new Metadata()).metadata;
- String sentiment = md.get("Sentiment");
- assertNotNull(sentiment);
- assertEquals("positive", sentiment);
- }
-
- @Test
- public void testCategorical() throws Exception {
- Parser parser = getParser("tika-config-sentiment-opennlp-cat.json");
- if (parser == null) {
- return;
- }
- String text = "Whatever, I need some cooling off time!";
- Metadata md = getXML(TikaInputStream.get(text.getBytes(StandardCharsets.UTF_8)),
- parser, new Metadata()).metadata;
- String sentiment = md.get("Sentiment");
- assertNotNull(sentiment);
- assertEquals("angry", sentiment);
- }
-
- private Parser getParser(String configJson) throws TikaException, IOException, URISyntaxException {
- try {
- return TikaLoader.load(
- getConfigPath(SentimentAnalysisParserTest.class, configJson))
- .loadAutoDetectParser();
- } catch (TikaConfigException e) {
- //if can't connect to pull sentiment model...ignore test
- if (e.getCause() instanceof IOException) {
- return null;
- }
- throw e;
- }
- }
-}
diff --git a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/test/resources/configs/tika-config-sentiment-opennlp-cat.json b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/test/resources/configs/tika-config-sentiment-opennlp-cat.json
deleted file mode 100644
index 5e362efc0..000000000
--- a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/test/resources/configs/tika-config-sentiment-opennlp-cat.json
+++ /dev/null
@@ -1,10 +0,0 @@
-{
- "parsers": [
- {
- "sentiment-analysis-parser": {
- "_mime-include": ["text/plain", "application/sentiment"],
- "modelPath": "https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/ht-sentiment-categ.bin"
- }
- }
- ]
-}
diff --git a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/test/resources/configs/tika-config-sentiment-opennlp.json b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/test/resources/configs/tika-config-sentiment-opennlp.json
deleted file mode 100644
index eefb16d6f..000000000
--- a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/test/resources/configs/tika-config-sentiment-opennlp.json
+++ /dev/null
@@ -1,10 +0,0 @@
-{
- "parsers": [
- {
- "sentiment-analysis-parser": {
- "_mime-include": ["text/plain", "application/sentiment"],
- "modelPath": "https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin"
- }
- }
- ]
-}