(opennlp-sandbox) branch main updated: Modernize opennlp-similarity component (#387)

mawiesne Sat, 29 Nov 2025 07:42:32 -0800

This is an automated email from the ASF dual-hosted git repository.

mawiesne pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/opennlp-sandbox.git



The following commit(s) were added to refs/heads/main by this push:
     new e8610a7  Modernize opennlp-similarity component (#387)
e8610a7 is described below

commit e8610a724a00ab4a829102c33c6fca46004e5f6e
Author: Martin Wiesner <[email protected]>
AuthorDate: Sat Nov 29 16:42:19 2025 +0100

    Modernize opennlp-similarity component (#387)
---
 opennlp-similarity/pom.xml                         | 19 +++++--
 .../tools/doc_classifier/DocClassifier.java        | 40 +++++----------
 .../java/opennlp/tools/fca/BasicLevelMetrics.java  | 19 +++----
 .../java/opennlp/tools/fca/ConceptLattice.java     |  5 +-
 .../JSMLearnerOnLatticeWithDeduction.java          | 43 ++++++----------
 .../apps/solr/SyntGenRequestHandler.java           | 41 +++++++--------
 .../tools/textsimilarity/ParseTreeChunk.java       | 60 ++++++++++++----------
 7 files changed, 103 insertions(+), 124 deletions(-)

diff --git a/opennlp-similarity/pom.xml b/opennlp-similarity/pom.xml
index 3fcbcbe..06a5360 100644
--- a/opennlp-similarity/pom.xml
+++ b/opennlp-similarity/pom.xml
@@ -31,8 +31,10 @@
     <jakarta.mail.version>2.1.4</jakarta.mail.version>
     <org.json.version>20250517</org.json.version>
 
+    <lucene.version>9.12.3</lucene.version>
+    <solr.version>9.10.0</solr.version>
     <tika.version>3.2.3</tika.version>
-    <solr.version>8.11.4</solr.version>
+    
     <docx4j.version>11.5.7</docx4j.version>
     <dl4j.version>1.0.0-M2.1</dl4j.version>
     <hdf5.version>1.14.3-1.5.10</hdf5.version>
@@ -101,12 +103,12 @@
       <scope>runtime</scope>
     </dependency>
     <!-- End model resources -->
-    
+
     <dependency>
-      <groupId>commons-io</groupId>
-      <artifactId>commons-io</artifactId>
-      <scope>runtime</scope>
+      <groupId>org.apache.commons</groupId>
+      <artifactId>commons-collections4</artifactId>
     </dependency>
+
     <dependency>
       <groupId>jakarta.xml.bind</groupId>
       <artifactId>jakarta.xml.bind-api</artifactId>
@@ -158,6 +160,13 @@
       </exclusions>
     </dependency>
 
+    <!-- required for solr in the chosen version -->
+    <dependency>
+      <groupId>org.apache.lucene</groupId>
+      <artifactId>lucene-queryparser</artifactId>
+      <version>${lucene.version}</version>
+    </dependency>
+
     <dependency>
       <groupId>edu.mit</groupId>
       <artifactId>jverbnet</artifactId>
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier/DocClassifier.java b/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier/DocClassifier.java
index 784ebb2..5f50277 100644
--- a/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier/DocClassifier.java
+++ b/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier/DocClassifier.java
@@ -49,7 +49,9 @@ public class DocClassifier {
 
        private static final Logger LOGGER = 
LoggerFactory.getLogger(DocClassifier.class);
        public static final String DOC_CLASSIFIER_KEY = "doc_class";
+  public static final String DOC_CLASSIFIER_MAP = "doc_classifier_map";
        public static final String RESOURCE_DIR = null;
+
        private Map<String, Float> scoredClasses;
        
        public static final Float MIN_TOTAL_SCORE_FOR_CATEGORY = 0.3f; //3.0f;
@@ -66,22 +68,10 @@ public class DocClassifier {
        // to accumulate classif results
        private final CountItemsList<String> localCats = new CountItemsList<>();
        private static final int MAX_TOKENS_TO_FORM = 30;
-       private final String CAT_COMPUTING = "computing";
-       public static final String DOC_CLASSIFIER_MAP = "doc_classifier_map";
-       private static final int MIN_SENTENCE_LENGTH_TO_CATEGORIZE = 60; // if
-       // sentence
-       // is
-       // shorter,
-       // should
-       // not
-       // be
-       // used
-       // for
-       // classification
-       private static final int MIN_CHARS_IN_QUERY = 30; // if combination of
-       // keywords are shorter,
-       // should not be used
-       // for classification
+  // if sentence is shorter, should not be used for classification
+       private static final int MIN_SENTENCE_LENGTH_TO_CATEGORIZE = 60;
+  // if combination of keywords are shorter, should not be used for classification
+       private static final int MIN_CHARS_IN_QUERY = 30;
 
        // these are categories from the index
        public static final String[] CATEGORIES = new String[]
@@ -94,13 +84,13 @@ public class DocClassifier {
                        try {
                                indexDirectory = FSDirectory.open(new 
File(INDEX_PATH).toPath());
                        } catch (IOException e2) {
-                               LOGGER.error("problem opening index " + e2);
+        LOGGER.error("problem opening index {}", String.valueOf(e2));
                        }
                        try {
                                indexReader = 
DirectoryReader.open(indexDirectory);
                                indexSearcher = new IndexSearcher(indexReader);
                        } catch (IOException e2) {
-                               LOGGER.error("problem reading index \n" + e2);
+        LOGGER.error("problem reading index \n{}", String.valueOf(e2));
                        }
                }
        }
@@ -134,7 +124,7 @@ public class DocClassifier {
                } catch (IOException e1) {
                        LOGGER.error("problem searching index \n", e1);
                }
-               LOGGER.debug("Found " + hits.totalHits + " hits for " + 
queryStr);
+    LOGGER.debug("Found {} hits for {}", hits.totalHits, queryStr);
                int count = 0;
                
 
@@ -143,8 +133,7 @@ public class DocClassifier {
                        try {
                                doc = indexSearcher.doc(scoreDoc.doc);
                        } catch (IOException e) {
-                               LOGGER.error("Problem searching training set 
for classif \n"
-                                               + e);
+        LOGGER.error("Problem searching training set for classif \n{}", 
String.valueOf(e));
                                continue;
                        }
                        String flag = doc.get("class");
@@ -170,8 +159,7 @@ public class DocClassifier {
                                if (scoredClasses.get(key) > 
MIN_TOTAL_SCORE_FOR_CATEGORY)
                                        resultsAboveThresh.add(key);
                                else
-                                       LOGGER.debug("Too low score of " + 
scoredClasses.get(key)
-                                                       + " for category = " + 
key);
+          LOGGER.debug("Too low score of {} for category = {}", 
scoredClasses.get(key), key);
                        }
 
                        int len = resultsAboveThresh.size();
@@ -182,7 +170,7 @@ public class DocClassifier {
                        else
                                results = resultsAboveThresh;
                } catch (Exception e) {
-                       LOGGER.error("Problem aggregating search results\n" + 
e);
+      LOGGER.error("Problem aggregating search results\n{}", 
String.valueOf(e));
                }
                if (results.size() < 2)
                        return results;
@@ -262,9 +250,9 @@ public class DocClassifier {
                                        continue;
                                String query = formClassifQuery(sentence, 
MAX_TOKENS_TO_FORM);
                                classifResults = classifySentence(query);
-                               if (classifResults != null && 
classifResults.size() > 0) {
+                               if (classifResults != null && 
!classifResults.isEmpty()) {
                                        localCats.addAll(classifResults);
-                                       LOGGER.debug(sentence + " =>  " + 
classifResults);
+          LOGGER.debug("{} => {}", sentence, classifResults);
                                }
                        }
                } catch (Exception e) {
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/fca/BasicLevelMetrics.java b/opennlp-similarity/src/main/java/opennlp/tools/fca/BasicLevelMetrics.java
index 668e0ab..cc619cc 100755
--- a/opennlp-similarity/src/main/java/opennlp/tools/fca/BasicLevelMetrics.java
+++ b/opennlp-similarity/src/main/java/opennlp/tools/fca/BasicLevelMetrics.java
@@ -19,16 +19,16 @@ package opennlp.tools.fca;
 
 import java.util.ArrayList;
 import java.util.HashSet;
+import java.util.List;
 import java.util.Set;
 
-import org.apache.commons.collections.ListUtils;
+import org.apache.commons.collections4.ListUtils;
 
 public class BasicLevelMetrics {
        
-       final ConceptLattice cl;
-       ArrayList<ArrayList<Integer>> attributesExtent;
-       ArrayList<ArrayList<Integer>> objectsIntent  = null; 
-       ArrayList<Integer> attributes = null; 
+       private final ConceptLattice cl;
+       private List<ArrayList<Integer>> attributesExtent;
+  private List<Integer> attributes = null;
        private final double[][] objectsSimilarityJ;
        private final double [][] objectsSimilaritySMC;
 
@@ -41,7 +41,7 @@ public class BasicLevelMetrics {
        
        public void setUp(){
                attributesExtent = new ArrayList<>();
-               objectsIntent = new ArrayList<>();
+    List<ArrayList<Integer>> objectsIntent = new ArrayList<>();
                attributes = new ArrayList<>();
                
                for (int i=0;i<cl.attributeCount;i++){
@@ -66,15 +66,10 @@ public class BasicLevelMetrics {
                        objectsSimilarityJ[i][i] = 1;
                        objectsSimilaritySMC[i][i] = 1;
                }
-               
-               //System.out.println("J");
-               //System.out.println(Arrays.deepToString(objectsSimilarityJ));
-               //System.out.println("SMC");
-               //System.out.println(Arrays.deepToString(objectsSimilaritySMC));
 
        } 
         
-       //Utility functions for  Similarity approach (S)
+       // Utility functions for  Similarity approach (S)
        public double simSMC (ArrayList<Integer> intent1, 
ArrayList<Integer>intent2){
                int tp = (ListUtils.intersection(intent1,intent2)).size();
                ArrayList<Integer> fnlst = new ArrayList<>(this.attributes);
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/fca/ConceptLattice.java b/opennlp-similarity/src/main/java/opennlp/tools/fca/ConceptLattice.java
index 6d59154..5735b31 100755
--- a/opennlp-similarity/src/main/java/opennlp/tools/fca/ConceptLattice.java
+++ b/opennlp-similarity/src/main/java/opennlp/tools/fca/ConceptLattice.java
@@ -26,7 +26,7 @@ import java.util.LinkedHashSet;
 import java.util.List;
 import java.util.Set;
 
-import org.apache.commons.collections.ListUtils;
+import org.apache.commons.collections4.ListUtils;
 
 public class ConceptLattice {
        int objectCount;
@@ -88,7 +88,6 @@ public class ConceptLattice {
                }                               
        }
 
-       
        public int GetMaximalConcept(List<Integer> intent, int Generator) {
                boolean parentIsMaximal = true;
                while(parentIsMaximal) {
@@ -105,7 +104,7 @@ public class ConceptLattice {
        }
        
        public void AddExtentToAncestors(LinkedHashSet<Integer>extent, int 
curNode) {
-               if (conceptList.get(curNode).parents.size()>0){
+               if (!conceptList.get(curNode).parents.isEmpty()){
                        for (int parent : conceptList.get(curNode).parents){
                                conceptList.get(parent).addExtents(extent);
                                AddExtentToAncestors(extent, parent);
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/jsmlearning/JSMLearnerOnLatticeWithDeduction.java b/opennlp-similarity/src/main/java/opennlp/tools/jsmlearning/JSMLearnerOnLatticeWithDeduction.java
index d9c8b83..6b9f32d 100644
--- a/opennlp-similarity/src/main/java/opennlp/tools/jsmlearning/JSMLearnerOnLatticeWithDeduction.java
+++ b/opennlp-similarity/src/main/java/opennlp/tools/jsmlearning/JSMLearnerOnLatticeWithDeduction.java
@@ -22,7 +22,7 @@ import java.util.Arrays;
 import java.util.LinkedHashSet;
 import java.util.List;
 
-import org.apache.commons.collections.ListUtils;
+import org.apache.commons.collections4.ListUtils;
 
 import 
opennlp.tools.parse_thicket.pattern_structure.LinguisticPatternStructure;
 import opennlp.tools.similarity.apps.utils.Pair;
@@ -31,13 +31,12 @@ import opennlp.tools.textsimilarity.ParseTreeChunk;
 public class JSMLearnerOnLatticeWithDeduction extends JSMLearnerOnLatticeBase{
        final List<JSMDecision> accumulatedJSMResults = new ArrayList<>();
 
-
-
-       public JSMDecision buildLearningModel(List<String> posTexts, 
List<String> negTexts, 
-                       String unknown, String[] separationKeywords){
-               psPos = new LinguisticPatternStructure(0,0); psNeg = new 
LinguisticPatternStructure(0,0);
+       public JSMDecision buildLearningModel(List<String> posTexts, 
List<String> negTexts,
+                                        String unknown, String[] 
separationKeywords){
+               psPos = new LinguisticPatternStructure(0,0);
+    psNeg = new LinguisticPatternStructure(0,0);
                if (separationKeywords!=null){ // re-sort by occurrence of 
separation keyword
-                       Pair<List<String>, List<String>> pair = 
reGroupByOccurrenceOfSeparationKeyword(posTexts, negTexts, separationKeywords );
+                       Pair<List<String>, List<String>> pair = 
reGroupByOccurrenceOfSeparationKeyword(posTexts, negTexts, separationKeywords);
                        posTexts = pair.getFirst(); negTexts =  
pair.getSecond();
                }
 
@@ -96,8 +95,8 @@ public class JSMLearnerOnLatticeWithDeduction extends 
JSMLearnerOnLatticeBase{
 
                for(int iConcept = 0; iConcept<psNeg.conceptList.size(); 
iConcept++){
                        for (List<List<ParseTreeChunk>> negIntersection : 
negIntersections) {
-                               intersection = md
-                                                               
.matchTwoSentencesGroupedChunksDeterministic(psNeg.conceptList.get(iConcept).intent,
 negIntersection);
+                               intersection = 
md.matchTwoSentencesGroupedChunksDeterministic(
+                psNeg.conceptList.get(iConcept).intent, negIntersection);
                                if (reduceList(intersection).size() > 0)
                                        
posIntersectionsUnderNeg.add(reduceList(intersection));
                        }
@@ -112,8 +111,8 @@ public class JSMLearnerOnLatticeWithDeduction extends 
JSMLearnerOnLatticeBase{
                        }
                }
 
-               List<ParseTreeChunk>posIntersectionsUnderNegLst = 
flattenParseTreeChunkLst(posIntersectionsUnderNeg);
-               
List<ParseTreeChunk>negIntersectionsUnderPosLst=flattenParseTreeChunkLst(negIntersectionsUnderPos);
+               List<ParseTreeChunk> posIntersectionsUnderNegLst = 
flattenParseTreeChunkLst(posIntersectionsUnderNeg);
+               List<ParseTreeChunk> 
negIntersectionsUnderPosLst=flattenParseTreeChunkLst(negIntersectionsUnderPos);
 
                posIntersectionsUnderNegLst = 
subtract(posIntersectionsUnderNegLst, negIntersectionsUnderPosLst);
                negIntersectionsUnderPosLst= 
subtract(negIntersectionsUnderPosLst, posIntersectionsUnderNegLst);
@@ -135,13 +134,10 @@ public class JSMLearnerOnLatticeWithDeduction extends 
JSMLearnerOnLatticeBase{
        }
 
        private List<List<ParseTreeChunk>> 
computeIntersectionWithIntentExtendedByDeduction(
-                       LinguisticPatternStructure psPos, int iConcept,
-                       List<List<ParseTreeChunk>> chunksUnknown) {
+                       LinguisticPatternStructure psPos, int iConcept, 
List<List<ParseTreeChunk>> chunksUnknown) {
                
-               List<List<ParseTreeChunk>> intent = 
psPos.conceptList.get(iConcept).intent, 
-                               intentExtendedByDeduction = new ArrayList<>();
+               List<List<ParseTreeChunk>> intent = 
psPos.conceptList.get(iConcept).intent, intentExtendedByDeduction = new 
ArrayList<>();
                
-       
                for(  List<ParseTreeChunk> group: intent){
                        List<ParseTreeChunk> newGroup = new ArrayList<>();
                        for(ParseTreeChunk ch: group){
@@ -153,9 +149,7 @@ public class JSMLearnerOnLatticeWithDeduction extends 
JSMLearnerOnLatticeBase{
                        }
                        intentExtendedByDeduction .add(newGroup);
                } 
-                return md
-                       
.matchTwoSentencesGroupedChunksDeterministic(intentExtendedByDeduction, 
chunksUnknown);
-               
+    return 
md.matchTwoSentencesGroupedChunksDeterministic(intentExtendedByDeduction, 
chunksUnknown);
        }
     
        // for list of words in a phrase, identify if it includes a separation 
word/multiword and get respective clause body 
@@ -176,7 +170,7 @@ public class JSMLearnerOnLatticeWithDeduction extends 
JSMLearnerOnLatticeBase{
 
        public Pair<List<String>, List<String>>  
reGroupByOccurrenceOfSeparationKeyword(List<String> posTexts, List<String> 
negTexts, String[] keywords){
                List<String> posTextsNew = new ArrayList<>(), negTextsNew = new 
ArrayList<>();
-               for(String posText:posTexts){
+               for(String posText:posTexts) {
                        boolean multiwordOccurs = true;
                        for(String keyword: keywords){
                                if (!posText.contains(keyword)) {
@@ -190,7 +184,7 @@ public class JSMLearnerOnLatticeWithDeduction extends 
JSMLearnerOnLatticeBase{
                        else
                                negTextsNew.add(posText);
                }
-               for(String negText:negTexts){
+               for(String negText:negTexts) {
                        boolean multiwordOccurs = true;
                        for(String keyword: keywords){
                                if (!negText.contains(keyword)) {
@@ -204,8 +198,6 @@ public class JSMLearnerOnLatticeWithDeduction extends 
JSMLearnerOnLatticeBase{
                        else
                                negTextsNew.add(negText);
                }
-
-
                return new Pair<>(posTextsNew, negTextsNew);
        }
 
@@ -234,10 +226,5 @@ public class JSMLearnerOnLatticeWithDeduction extends 
JSMLearnerOnLatticeBase{
                // Finally, do prediction
                JSMDecision dec = // may be determined by ...
                                jsm.buildLearningModel(Arrays.asList(posArr), 
Arrays.asList(negArr), unknown , new String[]{"property"});
-               
-               
-               
-
-
        }
 }
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/SyntGenRequestHandler.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/SyntGenRequestHandler.java
index 4a25654..bf78e15 100644
--- a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/SyntGenRequestHandler.java
+++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/SyntGenRequestHandler.java
@@ -60,7 +60,10 @@ import org.apache.solr.search.SolrIndexSearcher;
 
 public class SyntGenRequestHandler extends SearchHandler {
 
-       private final ParseTreeChunkListScorer parseTreeChunkListScorer = new 
ParseTreeChunkListScorer();
+  private static final String SCORE = "score";
+  private static final String RESPONSE = "response";
+  private static final String PREFIX_QUERY = "q=";
+  private final ParseTreeChunkListScorer parseTreeChunkListScorer = new 
ParseTreeChunkListScorer();
 
        public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse 
rsp){
                try {
@@ -77,7 +80,7 @@ public class SyntGenRequestHandler extends SearchHandler {
 
                //modify rsp
                NamedList<Object> values = rsp.getValues();
-               ResultContext c = (ResultContext) values.get("response");
+               ResultContext c = (ResultContext) values.get(RESPONSE);
                if (c==null)
                        return;
 
@@ -97,13 +100,12 @@ public class SyntGenRequestHandler extends SearchHandler {
                        e.printStackTrace();
                }
                // c.docs = dListResult;
-               values.remove("response");
+               values.remove(RESPONSE);
 
                rsp.setAllValues(values);
        }
 
-       public DocList filterResultsBySyntMatchReduceDocSet(DocList docList,
-                       SolrQueryRequest req,  SolrParams params) {
+       public DocList filterResultsBySyntMatchReduceDocSet(DocList docList, 
SolrQueryRequest req,  SolrParams params) {
                //if (!docList.hasScores())
                //      return docList;
 
@@ -117,7 +119,7 @@ public class SyntGenRequestHandler extends SearchHandler {
                String requestExpression = req.getParamString();
                String[] exprParts = requestExpression.split("&");
                for(String part: exprParts){
-                       if (part.startsWith("q="))
+                       if (part.startsWith(PREFIX_QUERY))
                                requestExpression = part;
                }
                String fieldNameQuery = 
StringUtils.substringBetween(requestExpression, "=", ":");
@@ -126,7 +128,7 @@ public class SyntGenRequestHandler extends SearchHandler {
                if  (queryParts.length>=2 && queryParts[1].length()>5)
                        requestExpression = queryParts[1].replace('+', ' ');
                else if (requestExpression.contains(":")) {// still field-based 
expression
-                       requestExpression = 
requestExpression.replaceAll(fieldNameQuery+":", "").replace('+',' 
').replaceAll("  ", " ").replace("q=", "");
+                       requestExpression = 
requestExpression.replaceAll(fieldNameQuery+":", "").replace('+',' 
').replaceAll("  ", " ").replace(PREFIX_QUERY, "");
                }
 
                if (fieldNameQuery ==null)
@@ -217,7 +219,7 @@ public class SyntGenRequestHandler extends SearchHandler {
                int numFound = 0;
                List<SolrDocument> slice = new ArrayList<>();
                for (SolrDocument sdoc : results) {
-                       Float score = (Float) sdoc.getFieldValue("score");
+                       Float score = (Float) sdoc.getFieldValue(SCORE);
                        if (maxScore < score) {
                                maxScore = score;
                        }
@@ -231,13 +233,13 @@ public class SyntGenRequestHandler extends SearchHandler {
                results.setNumFound(numFound);
                results.setMaxScore(maxScore);
                results.setStart(start);
-               rsp.add("response", results);
+               rsp.add(RESPONSE, results);
 
        }
 
 
        private Query buildFilter(String[] fqs, SolrQueryRequest req)
-       throws IOException, ParseException {
+          throws IOException, ParseException {
                if (fqs != null && fqs.length > 0) {
                        BooleanQuery.Builder fquery =  new 
BooleanQuery.Builder();
                        for (String fq : fqs) {
@@ -254,17 +256,16 @@ public class SyntGenRequestHandler extends SearchHandler {
                return null;
        }
 
-       private void doSearch1(SolrDocumentList results,
-                       SolrIndexSearcher searcher, String q, Query filter,
-                       int ndocs, SolrQueryRequest req,
-                       Map<String,SchemaField> fields, Set<Integer> 
alreadyFound) 
-       throws IOException {
+       private void doSearch1(SolrDocumentList results, SolrIndexSearcher 
searcher,
+                         String q, Query filter, int ndocs, SolrQueryRequest 
req,
+                         Map<String,SchemaField> fields, Set<Integer> 
alreadyFound)
+          throws IOException {
 
                // build custom query and extra fields
                Map<String,Object> extraFields = new HashMap<>();
                extraFields.put("search_type", "search1");
                boolean includeScore = 
-                       req.getParams().get(CommonParams.FL).contains("score");
+                       req.getParams().get(CommonParams.FL).contains(SCORE);
 
                int  maxDocsPerSearcherType = 0;
                float maprelScoreCutoff = 2.0f;
@@ -296,7 +297,7 @@ public class SyntGenRequestHandler extends SearchHandler {
                                sdoc.addField(extraField, 
extraFields.get(extraField));
                        }
                        if (includeScore) {
-                               sdoc.addField("score", hit.score);
+                               sdoc.addField(SCORE, hit.score);
                        }
                        results.add(sdoc);
                        alreadyFound.add(hit.doc);
@@ -315,9 +316,3 @@ public class SyntGenRequestHandler extends SearchHandler {
        }
 
 }
-
-/*
- * 
- * 
- * 
http://localhost:8080/solr/syntgen/?q=add-style-to-your-every-day-fresh-design-iphone-cases&t1=Personalized+iPhone+Cases&d1=Add+style+to+your+every+day+with+a+custom+iPhone+case&t2=Personalized+iPhone+Cases&d2=Add+style+to+your+every+day+with+a+custom+iPhone+case&t3=Personalized+iPhone+Cases&d3=Add+style+to+your+every+day+with+a+custom+iPhone+case&t4=Personalized+iPhone+Cases&d4=add+style+to+your+every+day+with+a+custom+iPhone+case
- * */
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParseTreeChunk.java b/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParseTreeChunk.java
index 8224273..54faa5d 100644
--- a/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParseTreeChunk.java
+++ b/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParseTreeChunk.java
@@ -24,7 +24,7 @@ import java.util.Collections;
 import java.util.List;
 import java.util.Map;
 
-import org.apache.commons.collections.ListUtils;
+import org.apache.commons.collections4.ListUtils;
 import org.apache.commons.lang3.StringUtils;
 
 import opennlp.tools.parse_thicket.ParseTreeNode;
@@ -32,7 +32,13 @@ import opennlp.tools.parse_thicket.ParseTreeNode;
 public class ParseTreeChunk implements Serializable {
 
        private static final long serialVersionUID = -9007722991829174647L;
-       private String mainPOS;
+  private static final String COLON = ":";
+  private static final String ASTERISK = "*";
+  private static final String DASH = "-";
+  private static final String WHITESPACE = " ";
+  private static final String HASH = "#";
+
+  private String mainPOS;
 
        private List<String> lemmas;
 
@@ -71,7 +77,7 @@ public class ParseTreeChunk implements Serializable {
                this.mainPOS = StringUtils.substringBetween(phrStr, ">", "'");
                for(String part: parts){
                        String lemma = StringUtils.substringBetween(part, "P'", 
"':");
-                       String pos = part.substring(part.indexOf(":")+1, 
part.length());
+                       String pos = part.substring(part.indexOf(COLON)+1, 
part.length());
                        
                        if (pos==null || lemma ==null){
                                continue;
@@ -173,7 +179,7 @@ public class ParseTreeChunk implements Serializable {
        public List<ParseTreeChunk> buildChunks(List<LemmaPair> parseResults) {
                List<ParseTreeChunk> chunksResults = new ArrayList<>();
                for (LemmaPair chunk : parseResults) {
-                       String[] lemmasAr = chunk.getLemma().split(" ");
+                       String[] lemmasAr = chunk.getLemma().split(WHITESPACE);
                        List<String> poss = new ArrayList<>(), lems = new 
ArrayList<>();
                        for (String lem : lemmasAr) {
                                lems.add(lem);
@@ -220,9 +226,11 @@ public class ParseTreeChunk implements Serializable {
 
        // groups noun phrases, verb phrases, propos phrases etc. for separate 
match
 
-       public List<List<ParseTreeChunk>> groupChunksAsParses(
-                       List<ParseTreeChunk> parseResults) {
-               List<ParseTreeChunk> np = new ArrayList<>(), vp = new 
ArrayList<>(), prp = new ArrayList<>(), sbarp = new ArrayList<>(), pp = new 
ArrayList<>(), adjp = new ArrayList<>(), whadvp = new ArrayList<>(), 
restOfPhrasesTypes = new ArrayList<>();
+       public List<List<ParseTreeChunk>> 
groupChunksAsParses(List<ParseTreeChunk> parseResults) {
+               List<ParseTreeChunk> np = new ArrayList<>(), vp = new 
ArrayList<>(), prp = new ArrayList<>(),
+            sbarp = new ArrayList<>(), pp = new ArrayList<>(),
+            adjp = new ArrayList<>(), whadvp = new ArrayList<>(),
+            restOfPhrasesTypes = new ArrayList<>();
                List<List<ParseTreeChunk>> results = new ArrayList<>();
                for (ParseTreeChunk ch : parseResults) {
                        String mainPos = ch.getMainPOS().toLowerCase();
@@ -276,7 +284,7 @@ public class ParseTreeChunk implements Serializable {
        public List<List<ParseTreeChunk>> matchTwoSentencesGroupedChunks(
                        List<List<ParseTreeChunk>> sent1, 
List<List<ParseTreeChunk>> sent2) {
                List<List<ParseTreeChunk>> results = new ArrayList<>();
-               // first irerate through component
+               // first iterate through component
                for (int comp = 0; comp < 2 && // just np & vp
                                comp < sent1.size() && comp < sent2.size(); 
comp++) {
                        List<ParseTreeChunk> resultComps = new ArrayList<>();
@@ -284,8 +292,7 @@ public class ParseTreeChunk implements Serializable {
                        for (ParseTreeChunk ch1 : sent1.get(comp)) {
                                for (ParseTreeChunk ch2 : sent2.get(comp)) { // 
simpler version
                                        ParseTreeChunk chunkToAdd = 
parseTreeMatcher
-                                                       
.generalizeTwoGroupedPhrasesRandomSelectHighestScoreWithTransforms(
-                                                                       ch1, 
ch2);
+                                                       
.generalizeTwoGroupedPhrasesRandomSelectHighestScoreWithTransforms(ch1, ch2);
 
                                        if 
(!lemmaFormManager.mustOccurVerifier(ch1, ch2, chunkToAdd)) {
                                                continue; // if the words which 
have to stay do not stay, proceed to
@@ -298,8 +305,7 @@ public class ParseTreeChunk implements Serializable {
                                                        break;
                                                }
 
-                                               if (parseTreeMatcher
-                                                               
.generalizeTwoGroupedPhrasesRandomSelectHighestScore(chunk,
+                                               if 
(parseTreeMatcher.generalizeTwoGroupedPhrasesRandomSelectHighestScore(chunk,
                                                                                
chunkToAdd).equalsTo(chunkToAdd)) {
                                                        alreadyThere = true;
                                                        break;
@@ -371,7 +377,7 @@ public class ParseTreeChunk implements Serializable {
                        }
                        
                        // this => *  ch=> run
-                       if (!this.lemmas.get(i).equals(lems.get(i)) && 
this.lemmas.get(i).equals("*")) 
+                       if (!this.lemmas.get(i).equals(lems.get(i)) && 
this.lemmas.get(i).equals(ASTERISK))
                                notSubChunkWithGivenAlignment = true;
                }
                if (!notSubChunkWithGivenAlignment && !unComparable)
@@ -395,7 +401,7 @@ public class ParseTreeChunk implements Serializable {
                        }
                        
                        // this => *  ch=> run
-                       if (!thisLemma.get(i).equals(chLemma.get(i)) && 
thisLemma.get(i).equals("*")) 
+                       if (!thisLemma.get(i).equals(chLemma.get(i)) && 
thisLemma.get(i).equals(ASTERISK))
                                notSubChunkWithGivenAlignment = true;
                }
                
@@ -430,11 +436,11 @@ public class ParseTreeChunk implements Serializable {
                if (mainPOS != null)
                        buf = new StringBuilder(mainPOS + " [");
                for (int i = 0; i < lemmas.size() && i < POSs.size() ; i++) {
-                       
buf.append(POSs.get(i)).append("-").append(lemmas.get(i)).append(" ");
+                       
buf.append(POSs.get(i)).append(DASH).append(lemmas.get(i)).append(WHITESPACE);
                        if (this.parseTreeNodes!=null){
                                Map<String, Object> attrs = 
this.parseTreeNodes.get(i).getAttributes();
                                if (attrs!=null && attrs.keySet().size()>0){
-                                       buf.append(attrs).append(" ");
+                                       buf.append(attrs).append(WHITESPACE);
                                }
                                String ner =this.parseTreeNodes.get(i).getNe();
                                if (ner!=null && ner.length()>1)
@@ -448,7 +454,7 @@ public class ParseTreeChunk implements Serializable {
                StringBuilder buf = new StringBuilder();
 
                for (String lemma : lemmas) {
-                       buf.append(lemma).append(" ");
+                       buf.append(lemma).append(WHITESPACE);
                }
                return buf.toString().trim();
        }
@@ -463,25 +469,25 @@ public class ParseTreeChunk implements Serializable {
 
        public String listToString(List<List<ParseTreeChunk>> chunks) {
                StringBuilder buf = new StringBuilder();
-               if (chunks.get(0).size() > 0) {
+               if (!chunks.get(0).isEmpty()) {
                        buf.append(" np ").append(chunks.get(0).toString());
                }
-               if (chunks.get(1).size() > 0) {
+               if (!chunks.get(1).isEmpty()) {
                        buf.append(" vp ").append(chunks.get(1).toString());
                }
                if (chunks.size() < 3) {
                        return buf.toString();
                }
-               if (chunks.get(2).size() > 0) {
+               if (!chunks.get(2).isEmpty()) {
                        buf.append(" prp ").append(chunks.get(2).toString());
                }
-               if (chunks.get(3).size() > 0) {
+               if (!chunks.get(3).isEmpty()) {
                        buf.append(" pp ").append(chunks.get(3).toString());
                }
-               if (chunks.get(4).size() > 0) {
+               if (!chunks.get(4).isEmpty()) {
                        buf.append(" adjp ").append(chunks.get(4).toString());
                }
-               if (chunks.get(5).size() > 0) {
+               if (!chunks.get(5).isEmpty()) {
                        buf.append(" whadvp ").append(chunks.get(5).toString());
                }
                /*
@@ -502,17 +508,17 @@ public class ParseTreeChunk implements Serializable {
                toParse = toParse.replace(" ]], [ [", "&");
                String[] phraseTypeFragments = toParse.trim().split("&");
                for (String toParseFragm : phraseTypeFragments) {
-                       toParseFragm = toParseFragm.replace("],  [", "#");
+                       toParseFragm = toParseFragm.replace("],  [", HASH);
 
                        List<ParseTreeChunk> resultsPhraseType = new 
ArrayList<>();
-                       String[] indivChunks = toParseFragm.trim().split("#");
+                       String[] indivChunks = toParseFragm.trim().split(HASH);
                        for (String expr : indivChunks) {
                                List<String> lems = new ArrayList<>(), poss = 
new ArrayList<>();
                                expr = expr.replace("[", "").replace(" ]", "");
-                               String[] pairs = expr.trim().split(" ");
+                               String[] pairs = expr.trim().split(WHITESPACE);
                                for (String word : pairs) {
                                        word = word.replace("]]", 
"").replace("]", "");
-                                       String[] pos_lem = word.split("-");
+                                       String[] pos_lem = word.split(DASH);
                                        lems.add(pos_lem[1].trim());
                                        poss.add(pos_lem[0].trim());
                                }

(opennlp-sandbox) branch main updated: Modernize opennlp-similarity component (#387)

Reply via email to