This is an automated email from the ASF dual-hosted git repository.
mawiesne pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/opennlp-sandbox.git
The following commit(s) were added to refs/heads/main by this push:
new e8610a7 Modernize opennlp-similarity component (#387)
e8610a7 is described below
commit e8610a724a00ab4a829102c33c6fca46004e5f6e
Author: Martin Wiesner <[email protected]>
AuthorDate: Sat Nov 29 16:42:19 2025 +0100
Modernize opennlp-similarity component (#387)
---
opennlp-similarity/pom.xml | 19 +++++--
.../tools/doc_classifier/DocClassifier.java | 40 +++++----------
.../java/opennlp/tools/fca/BasicLevelMetrics.java | 19 +++----
.../java/opennlp/tools/fca/ConceptLattice.java | 5 +-
.../JSMLearnerOnLatticeWithDeduction.java | 43 ++++++----------
.../apps/solr/SyntGenRequestHandler.java | 41 +++++++--------
.../tools/textsimilarity/ParseTreeChunk.java | 60 ++++++++++++----------
7 files changed, 103 insertions(+), 124 deletions(-)
diff --git a/opennlp-similarity/pom.xml b/opennlp-similarity/pom.xml
index 3fcbcbe..06a5360 100644
--- a/opennlp-similarity/pom.xml
+++ b/opennlp-similarity/pom.xml
@@ -31,8 +31,10 @@
<jakarta.mail.version>2.1.4</jakarta.mail.version>
<org.json.version>20250517</org.json.version>
+ <lucene.version>9.12.3</lucene.version>
+ <solr.version>9.10.0</solr.version>
<tika.version>3.2.3</tika.version>
- <solr.version>8.11.4</solr.version>
+
<docx4j.version>11.5.7</docx4j.version>
<dl4j.version>1.0.0-M2.1</dl4j.version>
<hdf5.version>1.14.3-1.5.10</hdf5.version>
@@ -101,12 +103,12 @@
<scope>runtime</scope>
</dependency>
<!-- End model resources -->
-
+
<dependency>
- <groupId>commons-io</groupId>
- <artifactId>commons-io</artifactId>
- <scope>runtime</scope>
+ <groupId>org.apache.commons</groupId>
+ <artifactId>commons-collections4</artifactId>
</dependency>
+
<dependency>
<groupId>jakarta.xml.bind</groupId>
<artifactId>jakarta.xml.bind-api</artifactId>
@@ -158,6 +160,13 @@
</exclusions>
</dependency>
+ <!-- required for solr in the chosen version -->
+ <dependency>
+ <groupId>org.apache.lucene</groupId>
+ <artifactId>lucene-queryparser</artifactId>
+ <version>${lucene.version}</version>
+ </dependency>
+
<dependency>
<groupId>edu.mit</groupId>
<artifactId>jverbnet</artifactId>
diff --git
a/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier/DocClassifier.java
b/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier/DocClassifier.java
index 784ebb2..5f50277 100644
---
a/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier/DocClassifier.java
+++
b/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier/DocClassifier.java
@@ -49,7 +49,9 @@ public class DocClassifier {
private static final Logger LOGGER =
LoggerFactory.getLogger(DocClassifier.class);
public static final String DOC_CLASSIFIER_KEY = "doc_class";
+ public static final String DOC_CLASSIFIER_MAP = "doc_classifier_map";
public static final String RESOURCE_DIR = null;
+
private Map<String, Float> scoredClasses;
public static final Float MIN_TOTAL_SCORE_FOR_CATEGORY = 0.3f; //3.0f;
@@ -66,22 +68,10 @@ public class DocClassifier {
// to accumulate classif results
private final CountItemsList<String> localCats = new CountItemsList<>();
private static final int MAX_TOKENS_TO_FORM = 30;
- private final String CAT_COMPUTING = "computing";
- public static final String DOC_CLASSIFIER_MAP = "doc_classifier_map";
- private static final int MIN_SENTENCE_LENGTH_TO_CATEGORIZE = 60; // if
- // sentence
- // is
- // shorter,
- // should
- // not
- // be
- // used
- // for
- // classification
- private static final int MIN_CHARS_IN_QUERY = 30; // if combination of
- // keywords are shorter,
- // should not be used
- // for classification
+ // if sentence is shorter, should not be used for classification
+ private static final int MIN_SENTENCE_LENGTH_TO_CATEGORIZE = 60;
+ // if combination of keywords are shorter, should not be used for classification
+ private static final int MIN_CHARS_IN_QUERY = 30;
// these are categories from the index
public static final String[] CATEGORIES = new String[]
@@ -94,13 +84,13 @@ public class DocClassifier {
try {
indexDirectory = FSDirectory.open(new
File(INDEX_PATH).toPath());
} catch (IOException e2) {
- LOGGER.error("problem opening index " + e2);
+ LOGGER.error("problem opening index {}", String.valueOf(e2));
}
try {
indexReader =
DirectoryReader.open(indexDirectory);
indexSearcher = new IndexSearcher(indexReader);
} catch (IOException e2) {
- LOGGER.error("problem reading index \n" + e2);
+ LOGGER.error("problem reading index \n{}", String.valueOf(e2));
}
}
}
@@ -134,7 +124,7 @@ public class DocClassifier {
} catch (IOException e1) {
LOGGER.error("problem searching index \n", e1);
}
- LOGGER.debug("Found " + hits.totalHits + " hits for " +
queryStr);
+ LOGGER.debug("Found {} hits for {}", hits.totalHits, queryStr);
int count = 0;
@@ -143,8 +133,7 @@ public class DocClassifier {
try {
doc = indexSearcher.doc(scoreDoc.doc);
} catch (IOException e) {
- LOGGER.error("Problem searching training set
for classif \n"
- + e);
+ LOGGER.error("Problem searching training set for classif \n{}",
String.valueOf(e));
continue;
}
String flag = doc.get("class");
@@ -170,8 +159,7 @@ public class DocClassifier {
if (scoredClasses.get(key) >
MIN_TOTAL_SCORE_FOR_CATEGORY)
resultsAboveThresh.add(key);
else
- LOGGER.debug("Too low score of " +
scoredClasses.get(key)
- + " for category = " +
key);
+ LOGGER.debug("Too low score of {} for category = {}",
scoredClasses.get(key), key);
}
int len = resultsAboveThresh.size();
@@ -182,7 +170,7 @@ public class DocClassifier {
else
results = resultsAboveThresh;
} catch (Exception e) {
- LOGGER.error("Problem aggregating search results\n" +
e);
+ LOGGER.error("Problem aggregating search results\n{}",
String.valueOf(e));
}
if (results.size() < 2)
return results;
@@ -262,9 +250,9 @@ public class DocClassifier {
continue;
String query = formClassifQuery(sentence,
MAX_TOKENS_TO_FORM);
classifResults = classifySentence(query);
- if (classifResults != null &&
classifResults.size() > 0) {
+ if (classifResults != null &&
!classifResults.isEmpty()) {
localCats.addAll(classifResults);
- LOGGER.debug(sentence + " => " +
classifResults);
+ LOGGER.debug("{} => {}", sentence, classifResults);
}
}
} catch (Exception e) {
diff --git
a/opennlp-similarity/src/main/java/opennlp/tools/fca/BasicLevelMetrics.java
b/opennlp-similarity/src/main/java/opennlp/tools/fca/BasicLevelMetrics.java
index 668e0ab..cc619cc 100755
--- a/opennlp-similarity/src/main/java/opennlp/tools/fca/BasicLevelMetrics.java
+++ b/opennlp-similarity/src/main/java/opennlp/tools/fca/BasicLevelMetrics.java
@@ -19,16 +19,16 @@ package opennlp.tools.fca;
import java.util.ArrayList;
import java.util.HashSet;
+import java.util.List;
import java.util.Set;
-import org.apache.commons.collections.ListUtils;
+import org.apache.commons.collections4.ListUtils;
public class BasicLevelMetrics {
- final ConceptLattice cl;
- ArrayList<ArrayList<Integer>> attributesExtent;
- ArrayList<ArrayList<Integer>> objectsIntent = null;
- ArrayList<Integer> attributes = null;
+ private final ConceptLattice cl;
+ private List<ArrayList<Integer>> attributesExtent;
+ private List<Integer> attributes = null;
private final double[][] objectsSimilarityJ;
private final double [][] objectsSimilaritySMC;
@@ -41,7 +41,7 @@ public class BasicLevelMetrics {
public void setUp(){
attributesExtent = new ArrayList<>();
- objectsIntent = new ArrayList<>();
+ List<ArrayList<Integer>> objectsIntent = new ArrayList<>();
attributes = new ArrayList<>();
for (int i=0;i<cl.attributeCount;i++){
@@ -66,15 +66,10 @@ public class BasicLevelMetrics {
objectsSimilarityJ[i][i] = 1;
objectsSimilaritySMC[i][i] = 1;
}
-
- //System.out.println("J");
- //System.out.println(Arrays.deepToString(objectsSimilarityJ));
- //System.out.println("SMC");
- //System.out.println(Arrays.deepToString(objectsSimilaritySMC));
}
- //Utility functions for Similarity approach (S)
+ // Utility functions for Similarity approach (S)
public double simSMC (ArrayList<Integer> intent1,
ArrayList<Integer>intent2){
int tp = (ListUtils.intersection(intent1,intent2)).size();
ArrayList<Integer> fnlst = new ArrayList<>(this.attributes);
diff --git
a/opennlp-similarity/src/main/java/opennlp/tools/fca/ConceptLattice.java
b/opennlp-similarity/src/main/java/opennlp/tools/fca/ConceptLattice.java
index 6d59154..5735b31 100755
--- a/opennlp-similarity/src/main/java/opennlp/tools/fca/ConceptLattice.java
+++ b/opennlp-similarity/src/main/java/opennlp/tools/fca/ConceptLattice.java
@@ -26,7 +26,7 @@ import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
-import org.apache.commons.collections.ListUtils;
+import org.apache.commons.collections4.ListUtils;
public class ConceptLattice {
int objectCount;
@@ -88,7 +88,6 @@ public class ConceptLattice {
}
}
-
public int GetMaximalConcept(List<Integer> intent, int Generator) {
boolean parentIsMaximal = true;
while(parentIsMaximal) {
@@ -105,7 +104,7 @@ public class ConceptLattice {
}
public void AddExtentToAncestors(LinkedHashSet<Integer>extent, int
curNode) {
- if (conceptList.get(curNode).parents.size()>0){
+ if (!conceptList.get(curNode).parents.isEmpty()){
for (int parent : conceptList.get(curNode).parents){
conceptList.get(parent).addExtents(extent);
AddExtentToAncestors(extent, parent);
diff --git
a/opennlp-similarity/src/main/java/opennlp/tools/jsmlearning/JSMLearnerOnLatticeWithDeduction.java
b/opennlp-similarity/src/main/java/opennlp/tools/jsmlearning/JSMLearnerOnLatticeWithDeduction.java
index d9c8b83..6b9f32d 100644
---
a/opennlp-similarity/src/main/java/opennlp/tools/jsmlearning/JSMLearnerOnLatticeWithDeduction.java
+++
b/opennlp-similarity/src/main/java/opennlp/tools/jsmlearning/JSMLearnerOnLatticeWithDeduction.java
@@ -22,7 +22,7 @@ import java.util.Arrays;
import java.util.LinkedHashSet;
import java.util.List;
-import org.apache.commons.collections.ListUtils;
+import org.apache.commons.collections4.ListUtils;
import
opennlp.tools.parse_thicket.pattern_structure.LinguisticPatternStructure;
import opennlp.tools.similarity.apps.utils.Pair;
@@ -31,13 +31,12 @@ import opennlp.tools.textsimilarity.ParseTreeChunk;
public class JSMLearnerOnLatticeWithDeduction extends JSMLearnerOnLatticeBase{
final List<JSMDecision> accumulatedJSMResults = new ArrayList<>();
-
-
- public JSMDecision buildLearningModel(List<String> posTexts,
List<String> negTexts,
- String unknown, String[] separationKeywords){
- psPos = new LinguisticPatternStructure(0,0); psNeg = new
LinguisticPatternStructure(0,0);
+ public JSMDecision buildLearningModel(List<String> posTexts,
List<String> negTexts,
+ String unknown, String[]
separationKeywords){
+ psPos = new LinguisticPatternStructure(0,0);
+ psNeg = new LinguisticPatternStructure(0,0);
if (separationKeywords!=null){ // re-sort by occurrence of
separation keyword
- Pair<List<String>, List<String>> pair =
reGroupByOccurrenceOfSeparationKeyword(posTexts, negTexts, separationKeywords );
+ Pair<List<String>, List<String>> pair =
reGroupByOccurrenceOfSeparationKeyword(posTexts, negTexts, separationKeywords);
posTexts = pair.getFirst(); negTexts =
pair.getSecond();
}
@@ -96,8 +95,8 @@ public class JSMLearnerOnLatticeWithDeduction extends
JSMLearnerOnLatticeBase{
for(int iConcept = 0; iConcept<psNeg.conceptList.size();
iConcept++){
for (List<List<ParseTreeChunk>> negIntersection :
negIntersections) {
- intersection = md
-
.matchTwoSentencesGroupedChunksDeterministic(psNeg.conceptList.get(iConcept).intent,
negIntersection);
+ intersection =
md.matchTwoSentencesGroupedChunksDeterministic(
+ psNeg.conceptList.get(iConcept).intent, negIntersection);
if (reduceList(intersection).size() > 0)
posIntersectionsUnderNeg.add(reduceList(intersection));
}
@@ -112,8 +111,8 @@ public class JSMLearnerOnLatticeWithDeduction extends
JSMLearnerOnLatticeBase{
}
}
- List<ParseTreeChunk>posIntersectionsUnderNegLst =
flattenParseTreeChunkLst(posIntersectionsUnderNeg);
-
List<ParseTreeChunk>negIntersectionsUnderPosLst=flattenParseTreeChunkLst(negIntersectionsUnderPos);
+ List<ParseTreeChunk> posIntersectionsUnderNegLst =
flattenParseTreeChunkLst(posIntersectionsUnderNeg);
+ List<ParseTreeChunk>
negIntersectionsUnderPosLst=flattenParseTreeChunkLst(negIntersectionsUnderPos);
posIntersectionsUnderNegLst =
subtract(posIntersectionsUnderNegLst, negIntersectionsUnderPosLst);
negIntersectionsUnderPosLst=
subtract(negIntersectionsUnderPosLst, posIntersectionsUnderNegLst);
@@ -135,13 +134,10 @@ public class JSMLearnerOnLatticeWithDeduction extends
JSMLearnerOnLatticeBase{
}
private List<List<ParseTreeChunk>>
computeIntersectionWithIntentExtendedByDeduction(
- LinguisticPatternStructure psPos, int iConcept,
- List<List<ParseTreeChunk>> chunksUnknown) {
+ LinguisticPatternStructure psPos, int iConcept,
List<List<ParseTreeChunk>> chunksUnknown) {
- List<List<ParseTreeChunk>> intent =
psPos.conceptList.get(iConcept).intent,
- intentExtendedByDeduction = new ArrayList<>();
+ List<List<ParseTreeChunk>> intent =
psPos.conceptList.get(iConcept).intent, intentExtendedByDeduction = new
ArrayList<>();
-
for( List<ParseTreeChunk> group: intent){
List<ParseTreeChunk> newGroup = new ArrayList<>();
for(ParseTreeChunk ch: group){
@@ -153,9 +149,7 @@ public class JSMLearnerOnLatticeWithDeduction extends
JSMLearnerOnLatticeBase{
}
intentExtendedByDeduction .add(newGroup);
}
- return md
-
.matchTwoSentencesGroupedChunksDeterministic(intentExtendedByDeduction,
chunksUnknown);
-
+ return
md.matchTwoSentencesGroupedChunksDeterministic(intentExtendedByDeduction,
chunksUnknown);
}
// for list of words in a phrase, identify if it includes a separation word/multiword and get respective clause body
@@ -176,7 +170,7 @@ public class JSMLearnerOnLatticeWithDeduction extends
JSMLearnerOnLatticeBase{
public Pair<List<String>, List<String>>
reGroupByOccurrenceOfSeparationKeyword(List<String> posTexts, List<String>
negTexts, String[] keywords){
List<String> posTextsNew = new ArrayList<>(), negTextsNew = new
ArrayList<>();
- for(String posText:posTexts){
+ for(String posText:posTexts) {
boolean multiwordOccurs = true;
for(String keyword: keywords){
if (!posText.contains(keyword)) {
@@ -190,7 +184,7 @@ public class JSMLearnerOnLatticeWithDeduction extends
JSMLearnerOnLatticeBase{
else
negTextsNew.add(posText);
}
- for(String negText:negTexts){
+ for(String negText:negTexts) {
boolean multiwordOccurs = true;
for(String keyword: keywords){
if (!negText.contains(keyword)) {
@@ -204,8 +198,6 @@ public class JSMLearnerOnLatticeWithDeduction extends
JSMLearnerOnLatticeBase{
else
negTextsNew.add(negText);
}
-
-
return new Pair<>(posTextsNew, negTextsNew);
}
@@ -234,10 +226,5 @@ public class JSMLearnerOnLatticeWithDeduction extends
JSMLearnerOnLatticeBase{
// Finally, do prediction
JSMDecision dec = // may be determined by ...
jsm.buildLearningModel(Arrays.asList(posArr),
Arrays.asList(negArr), unknown , new String[]{"property"});
-
-
-
-
-
}
}
diff --git
a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/SyntGenRequestHandler.java
b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/SyntGenRequestHandler.java
index 4a25654..bf78e15 100644
---
a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/SyntGenRequestHandler.java
+++
b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/SyntGenRequestHandler.java
@@ -60,7 +60,10 @@ import org.apache.solr.search.SolrIndexSearcher;
public class SyntGenRequestHandler extends SearchHandler {
- private final ParseTreeChunkListScorer parseTreeChunkListScorer = new
ParseTreeChunkListScorer();
+ private static final String SCORE = "score";
+ private static final String RESPONSE = "response";
+ private static final String PREFIX_QUERY = "q=";
+ private final ParseTreeChunkListScorer parseTreeChunkListScorer = new
ParseTreeChunkListScorer();
public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse
rsp){
try {
@@ -77,7 +80,7 @@ public class SyntGenRequestHandler extends SearchHandler {
//modify rsp
NamedList<Object> values = rsp.getValues();
- ResultContext c = (ResultContext) values.get("response");
+ ResultContext c = (ResultContext) values.get(RESPONSE);
if (c==null)
return;
@@ -97,13 +100,12 @@ public class SyntGenRequestHandler extends SearchHandler {
e.printStackTrace();
}
// c.docs = dListResult;
- values.remove("response");
+ values.remove(RESPONSE);
rsp.setAllValues(values);
}
- public DocList filterResultsBySyntMatchReduceDocSet(DocList docList,
- SolrQueryRequest req, SolrParams params) {
+ public DocList filterResultsBySyntMatchReduceDocSet(DocList docList,
SolrQueryRequest req, SolrParams params) {
//if (!docList.hasScores())
// return docList;
@@ -117,7 +119,7 @@ public class SyntGenRequestHandler extends SearchHandler {
String requestExpression = req.getParamString();
String[] exprParts = requestExpression.split("&");
for(String part: exprParts){
- if (part.startsWith("q="))
+ if (part.startsWith(PREFIX_QUERY))
requestExpression = part;
}
String fieldNameQuery =
StringUtils.substringBetween(requestExpression, "=", ":");
@@ -126,7 +128,7 @@ public class SyntGenRequestHandler extends SearchHandler {
if (queryParts.length>=2 && queryParts[1].length()>5)
requestExpression = queryParts[1].replace('+', ' ');
else if (requestExpression.contains(":")) {// still field-based
expression
- requestExpression =
requestExpression.replaceAll(fieldNameQuery+":", "").replace('+','
').replaceAll(" ", " ").replace("q=", "");
+ requestExpression =
requestExpression.replaceAll(fieldNameQuery+":", "").replace('+','
').replaceAll(" ", " ").replace(PREFIX_QUERY, "");
}
if (fieldNameQuery ==null)
@@ -217,7 +219,7 @@ public class SyntGenRequestHandler extends SearchHandler {
int numFound = 0;
List<SolrDocument> slice = new ArrayList<>();
for (SolrDocument sdoc : results) {
- Float score = (Float) sdoc.getFieldValue("score");
+ Float score = (Float) sdoc.getFieldValue(SCORE);
if (maxScore < score) {
maxScore = score;
}
@@ -231,13 +233,13 @@ public class SyntGenRequestHandler extends SearchHandler {
results.setNumFound(numFound);
results.setMaxScore(maxScore);
results.setStart(start);
- rsp.add("response", results);
+ rsp.add(RESPONSE, results);
}
private Query buildFilter(String[] fqs, SolrQueryRequest req)
- throws IOException, ParseException {
+ throws IOException, ParseException {
if (fqs != null && fqs.length > 0) {
BooleanQuery.Builder fquery = new
BooleanQuery.Builder();
for (String fq : fqs) {
@@ -254,17 +256,16 @@ public class SyntGenRequestHandler extends SearchHandler {
return null;
}
- private void doSearch1(SolrDocumentList results,
- SolrIndexSearcher searcher, String q, Query filter,
- int ndocs, SolrQueryRequest req,
- Map<String,SchemaField> fields, Set<Integer>
alreadyFound)
- throws IOException {
+ private void doSearch1(SolrDocumentList results, SolrIndexSearcher
searcher,
+ String q, Query filter, int ndocs, SolrQueryRequest
req,
+ Map<String,SchemaField> fields, Set<Integer>
alreadyFound)
+ throws IOException {
// build custom query and extra fields
Map<String,Object> extraFields = new HashMap<>();
extraFields.put("search_type", "search1");
boolean includeScore =
- req.getParams().get(CommonParams.FL).contains("score");
+ req.getParams().get(CommonParams.FL).contains(SCORE);
int maxDocsPerSearcherType = 0;
float maprelScoreCutoff = 2.0f;
@@ -296,7 +297,7 @@ public class SyntGenRequestHandler extends SearchHandler {
sdoc.addField(extraField,
extraFields.get(extraField));
}
if (includeScore) {
- sdoc.addField("score", hit.score);
+ sdoc.addField(SCORE, hit.score);
}
results.add(sdoc);
alreadyFound.add(hit.doc);
@@ -315,9 +316,3 @@ public class SyntGenRequestHandler extends SearchHandler {
}
}
-
-/*
- *
- *
- *
http://localhost:8080/solr/syntgen/?q=add-style-to-your-every-day-fresh-design-iphone-cases&t1=Personalized+iPhone+Cases&d1=Add+style+to+your+every+day+with+a+custom+iPhone+case&t2=Personalized+iPhone+Cases&d2=Add+style+to+your+every+day+with+a+custom+iPhone+case&t3=Personalized+iPhone+Cases&d3=Add+style+to+your+every+day+with+a+custom+iPhone+case&t4=Personalized+iPhone+Cases&d4=add+style+to+your+every+day+with+a+custom+iPhone+case
- * */
diff --git
a/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParseTreeChunk.java
b/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParseTreeChunk.java
index 8224273..54faa5d 100644
---
a/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParseTreeChunk.java
+++
b/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParseTreeChunk.java
@@ -24,7 +24,7 @@ import java.util.Collections;
import java.util.List;
import java.util.Map;
-import org.apache.commons.collections.ListUtils;
+import org.apache.commons.collections4.ListUtils;
import org.apache.commons.lang3.StringUtils;
import opennlp.tools.parse_thicket.ParseTreeNode;
@@ -32,7 +32,13 @@ import opennlp.tools.parse_thicket.ParseTreeNode;
public class ParseTreeChunk implements Serializable {
private static final long serialVersionUID = -9007722991829174647L;
- private String mainPOS;
+ private static final String COLON = ":";
+ private static final String ASTERISK = "*";
+ private static final String DASH = "-";
+ private static final String WHITESPACE = " ";
+ private static final String HASH = "#";
+
+ private String mainPOS;
private List<String> lemmas;
@@ -71,7 +77,7 @@ public class ParseTreeChunk implements Serializable {
this.mainPOS = StringUtils.substringBetween(phrStr, ">", "'");
for(String part: parts){
String lemma = StringUtils.substringBetween(part, "P'",
"':");
- String pos = part.substring(part.indexOf(":")+1,
part.length());
+ String pos = part.substring(part.indexOf(COLON)+1,
part.length());
if (pos==null || lemma ==null){
continue;
@@ -173,7 +179,7 @@ public class ParseTreeChunk implements Serializable {
public List<ParseTreeChunk> buildChunks(List<LemmaPair> parseResults) {
List<ParseTreeChunk> chunksResults = new ArrayList<>();
for (LemmaPair chunk : parseResults) {
- String[] lemmasAr = chunk.getLemma().split(" ");
+ String[] lemmasAr = chunk.getLemma().split(WHITESPACE);
List<String> poss = new ArrayList<>(), lems = new
ArrayList<>();
for (String lem : lemmasAr) {
lems.add(lem);
@@ -220,9 +226,11 @@ public class ParseTreeChunk implements Serializable {
// groups noun phrases, verb phrases, propos phrases etc. for separate match
- public List<List<ParseTreeChunk>> groupChunksAsParses(
- List<ParseTreeChunk> parseResults) {
- List<ParseTreeChunk> np = new ArrayList<>(), vp = new
ArrayList<>(), prp = new ArrayList<>(), sbarp = new ArrayList<>(), pp = new
ArrayList<>(), adjp = new ArrayList<>(), whadvp = new ArrayList<>(),
restOfPhrasesTypes = new ArrayList<>();
+ public List<List<ParseTreeChunk>>
groupChunksAsParses(List<ParseTreeChunk> parseResults) {
+ List<ParseTreeChunk> np = new ArrayList<>(), vp = new
ArrayList<>(), prp = new ArrayList<>(),
+ sbarp = new ArrayList<>(), pp = new ArrayList<>(),
+ adjp = new ArrayList<>(), whadvp = new ArrayList<>(),
+ restOfPhrasesTypes = new ArrayList<>();
List<List<ParseTreeChunk>> results = new ArrayList<>();
for (ParseTreeChunk ch : parseResults) {
String mainPos = ch.getMainPOS().toLowerCase();
@@ -276,7 +284,7 @@ public class ParseTreeChunk implements Serializable {
public List<List<ParseTreeChunk>> matchTwoSentencesGroupedChunks(
List<List<ParseTreeChunk>> sent1,
List<List<ParseTreeChunk>> sent2) {
List<List<ParseTreeChunk>> results = new ArrayList<>();
- // first irerate through component
+ // first iterate through component
for (int comp = 0; comp < 2 && // just np & vp
comp < sent1.size() && comp < sent2.size();
comp++) {
List<ParseTreeChunk> resultComps = new ArrayList<>();
@@ -284,8 +292,7 @@ public class ParseTreeChunk implements Serializable {
for (ParseTreeChunk ch1 : sent1.get(comp)) {
for (ParseTreeChunk ch2 : sent2.get(comp)) { //
simpler version
ParseTreeChunk chunkToAdd =
parseTreeMatcher
-
.generalizeTwoGroupedPhrasesRandomSelectHighestScoreWithTransforms(
- ch1,
ch2);
+
.generalizeTwoGroupedPhrasesRandomSelectHighestScoreWithTransforms(ch1, ch2);
if
(!lemmaFormManager.mustOccurVerifier(ch1, ch2, chunkToAdd)) {
continue; // if the words which
have to stay do not stay, proceed to
@@ -298,8 +305,7 @@ public class ParseTreeChunk implements Serializable {
break;
}
- if (parseTreeMatcher
-
.generalizeTwoGroupedPhrasesRandomSelectHighestScore(chunk,
+ if
(parseTreeMatcher.generalizeTwoGroupedPhrasesRandomSelectHighestScore(chunk,
chunkToAdd).equalsTo(chunkToAdd)) {
alreadyThere = true;
break;
@@ -371,7 +377,7 @@ public class ParseTreeChunk implements Serializable {
}
// this => * ch=> run
- if (!this.lemmas.get(i).equals(lems.get(i)) &&
this.lemmas.get(i).equals("*"))
+ if (!this.lemmas.get(i).equals(lems.get(i)) &&
this.lemmas.get(i).equals(ASTERISK))
notSubChunkWithGivenAlignment = true;
}
if (!notSubChunkWithGivenAlignment && !unComparable)
@@ -395,7 +401,7 @@ public class ParseTreeChunk implements Serializable {
}
// this => * ch=> run
- if (!thisLemma.get(i).equals(chLemma.get(i)) &&
thisLemma.get(i).equals("*"))
+ if (!thisLemma.get(i).equals(chLemma.get(i)) &&
thisLemma.get(i).equals(ASTERISK))
notSubChunkWithGivenAlignment = true;
}
@@ -430,11 +436,11 @@ public class ParseTreeChunk implements Serializable {
if (mainPOS != null)
buf = new StringBuilder(mainPOS + " [");
for (int i = 0; i < lemmas.size() && i < POSs.size() ; i++) {
-
buf.append(POSs.get(i)).append("-").append(lemmas.get(i)).append(" ");
+
buf.append(POSs.get(i)).append(DASH).append(lemmas.get(i)).append(WHITESPACE);
if (this.parseTreeNodes!=null){
Map<String, Object> attrs =
this.parseTreeNodes.get(i).getAttributes();
if (attrs!=null && attrs.keySet().size()>0){
- buf.append(attrs).append(" ");
+ buf.append(attrs).append(WHITESPACE);
}
String ner =this.parseTreeNodes.get(i).getNe();
if (ner!=null && ner.length()>1)
@@ -448,7 +454,7 @@ public class ParseTreeChunk implements Serializable {
StringBuilder buf = new StringBuilder();
for (String lemma : lemmas) {
- buf.append(lemma).append(" ");
+ buf.append(lemma).append(WHITESPACE);
}
return buf.toString().trim();
}
@@ -463,25 +469,25 @@ public class ParseTreeChunk implements Serializable {
public String listToString(List<List<ParseTreeChunk>> chunks) {
StringBuilder buf = new StringBuilder();
- if (chunks.get(0).size() > 0) {
+ if (!chunks.get(0).isEmpty()) {
buf.append(" np ").append(chunks.get(0).toString());
}
- if (chunks.get(1).size() > 0) {
+ if (!chunks.get(1).isEmpty()) {
buf.append(" vp ").append(chunks.get(1).toString());
}
if (chunks.size() < 3) {
return buf.toString();
}
- if (chunks.get(2).size() > 0) {
+ if (!chunks.get(2).isEmpty()) {
buf.append(" prp ").append(chunks.get(2).toString());
}
- if (chunks.get(3).size() > 0) {
+ if (!chunks.get(3).isEmpty()) {
buf.append(" pp ").append(chunks.get(3).toString());
}
- if (chunks.get(4).size() > 0) {
+ if (!chunks.get(4).isEmpty()) {
buf.append(" adjp ").append(chunks.get(4).toString());
}
- if (chunks.get(5).size() > 0) {
+ if (!chunks.get(5).isEmpty()) {
buf.append(" whadvp ").append(chunks.get(5).toString());
}
/*
@@ -502,17 +508,17 @@ public class ParseTreeChunk implements Serializable {
toParse = toParse.replace(" ]], [ [", "&");
String[] phraseTypeFragments = toParse.trim().split("&");
for (String toParseFragm : phraseTypeFragments) {
- toParseFragm = toParseFragm.replace("], [", "#");
+ toParseFragm = toParseFragm.replace("], [", HASH);
List<ParseTreeChunk> resultsPhraseType = new
ArrayList<>();
- String[] indivChunks = toParseFragm.trim().split("#");
+ String[] indivChunks = toParseFragm.trim().split(HASH);
for (String expr : indivChunks) {
List<String> lems = new ArrayList<>(), poss =
new ArrayList<>();
expr = expr.replace("[", "").replace(" ]", "");
- String[] pairs = expr.trim().split(" ");
+ String[] pairs = expr.trim().split(WHITESPACE);
for (String word : pairs) {
word = word.replace("]]",
"").replace("]", "");
- String[] pos_lem = word.split("-");
+ String[] pos_lem = word.split(DASH);
lems.add(pos_lem[1].trim());
poss.add(pos_lem[0].trim());
}