This is an automated email from the ASF dual-hosted git repository.
mawiesne pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/opennlp-sandbox.git
The following commit(s) were added to refs/heads/master by this push:
new 00a2d94 Fixes OPENNLP-1454 (#101)
00a2d94 is described below
commit 00a2d946d8facd0e7c0ed58411bdd0a28850eff6
Author: Martin Wiesner <[email protected]>
AuthorDate: Fri Apr 28 20:55:06 2023 +0200
Fixes OPENNLP-1454 (#101)
replaces many occurrences of sysouts (System.out printouts), printStackTrace(), and JUL logging
with SLF4J, as used in OpenNLP Tools
introduces log4j-2.x as a (managed) test-scoped runtime binding for slf4j in
`opennlp-similarity`
improves JavaDoc along the path
improves code formatting along the path
addresses PR reviewers' feedback
---
opennlp-similarity/pom.xml | 63 +++--
.../review_builder/WebPageReviewExtractor.java | 69 +++--
.../parse_thicket/apps/SnippetToParagraph.java | 66 ++---
.../tools/parse_thicket/apps/WebPageExtractor.java | 35 ++-
.../LinguisticPatternStructure.java | 68 +++--
.../pattern_structure/PhrasePatternStructure.java | 48 ++--
.../tools/similarity/apps/ContentGenerator.java | 30 ++-
.../similarity/apps/ContentGeneratorSupport.java | 52 ++--
.../apps/GeneratedSentenceProcessor.java | 224 +++++++---------
.../similarity/apps/RelatedSentenceFinder.java | 284 ++++++++++-----------
.../apps/SpeechRecognitionResultsProcessor.java | 64 ++---
.../textsimilarity/GeneralizationListReducer.java | 28 +-
.../textsimilarity/ParseTreeChunkListScorer.java | 33 ++-
.../ParserChunker2MatcherProcessor.java | 75 +++---
.../ParserChunker2MatcherProcessorTest.java | 11 +-
pom.xml | 28 ++
16 files changed, 568 insertions(+), 610 deletions(-)
diff --git a/opennlp-similarity/pom.xml b/opennlp-similarity/pom.xml
index 5a8482c..5f739bb 100644
--- a/opennlp-similarity/pom.xml
+++ b/opennlp-similarity/pom.xml
@@ -89,7 +89,28 @@
<groupId>org.apache.opennlp</groupId>
<artifactId>opennlp-tools</artifactId>
</dependency>
-
+
+ <dependency>
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-api</artifactId>
+ </dependency>
+
+ <dependency>
+ <groupId>org.apache.logging.log4j</groupId>
+ <artifactId>log4j-api</artifactId>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.logging.log4j</groupId>
+ <artifactId>log4j-core</artifactId>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.logging.log4j</groupId>
+ <artifactId>log4j-slf4j-impl</artifactId>
+ <scope>test</scope>
+ </dependency>
+
<dependency>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter-api</artifactId>
@@ -231,6 +252,17 @@
<groupId>org.docx4j</groupId>
<artifactId>docx4j</artifactId>
<version>3.3.7</version>
+ <exclusions>
+ <!-- Exclusion here as log4j version 2 bindings
are used during tests/runtime-->
+ <exclusion>
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-log4j12</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>log4j</groupId>
+ <artifactId>log4j</artifactId>
+ </exclusion>
+ </exclusions>
</dependency>
<dependency>
@@ -456,6 +488,20 @@
</configuration>
</plugin>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-surefire-plugin</artifactId>
+ <configuration>
+ <argLine>-Xmx2048m
-Dfile.encoding=UTF-8</argLine>
+
<forkCount>${opennlp.forkCount}</forkCount>
+ <reuseForks>false</reuseForks>
+
<failIfNoSpecifiedTests>false</failIfNoSpecifiedTests>
+ <excludes>
+ <exclude>**/*IT.java</exclude>
+ </excludes>
+ </configuration>
+ </plugin>
+
<plugin>
<artifactId>maven-source-plugin</artifactId>
<executions>
@@ -521,20 +567,7 @@
</execution>
</executions>
</plugin>
- <!-- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-gpg-plugin</artifactId>
- <executions>
- <execution>
- <id>sign-artifacts</id>
- <phase>verify</phase>
- <goals>
- <goal>sign</goal>
- </goals>
- </execution>
- </executions>
- </plugin>
- -->
+
<plugin>
<groupId>org.sonatype.plugins</groupId>
<artifactId>nexus-staging-maven-plugin</artifactId>
diff --git
a/opennlp-similarity/src/main/java/opennlp/tools/apps/review_builder/WebPageReviewExtractor.java
b/opennlp-similarity/src/main/java/opennlp/tools/apps/review_builder/WebPageReviewExtractor.java
index 38edb8e..078d56c 100644
---
a/opennlp-similarity/src/main/java/opennlp/tools/apps/review_builder/WebPageReviewExtractor.java
+++
b/opennlp-similarity/src/main/java/opennlp/tools/apps/review_builder/WebPageReviewExtractor.java
@@ -17,6 +17,7 @@
package opennlp.tools.apps.review_builder;
+import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
@@ -30,9 +31,13 @@ import opennlp.tools.textsimilarity.TextProcessor;
import
opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor;
import org.apache.commons.lang.StringUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
public class WebPageReviewExtractor extends WebPageExtractor {
-
+
+ private static final Logger LOG =
LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+
private final BingAPIProductSearchManager prodman = new
BingAPIProductSearchManager();
private final SentenceOriginalizer orig;
@@ -40,24 +45,20 @@ public class WebPageReviewExtractor extends
WebPageExtractor {
orig = new SentenceOriginalizer(resourceDir);
}
- public String[] removeDuplicates(String[] hits)
- {
+ public String[] removeDuplicates(String[] hits) {
StringDistanceMeasurer meas = new StringDistanceMeasurer();
List<Integer> idsToRemove = new ArrayList<>();
List<String> hitsDedup = new ArrayList<>();
try {
for (int i = 0; i < hits.length; i++)
- for (int j = i + 1; j < hits.length; j++)
- {
+ for (int j = i + 1; j < hits.length; j++) {
String title1 = hits[i];
String title2 = hits[j];
if (StringUtils.isEmpty(title1) ||
StringUtils.isEmpty(title2))
continue;
- if (meas.measureStringDistance(title1,
title2) > 0.7)
- {
- idsToRemove.add(j); // dupes
found, later list member to
-
// be deleted
+ if (meas.measureStringDistance(title1,
title2) > 0.7) {
+ idsToRemove.add(j); // dupes
found, later list member to be deleted
}
}
for (int i = 0; i < hits.length; i++)
@@ -79,7 +80,6 @@ public class WebPageReviewExtractor extends WebPageExtractor {
public ReviewObj extractSentencesWithPotentialReviewPhrases(String url)
{
ReviewObj reviewObj = new ReviewObj();
int maxSentsFromPage= 20;
- List<String[]> results = new ArrayList<>();
String downloadedPage = pageFetcher.fetchPage(url, 20000);
if (downloadedPage == null || downloadedPage.length() < 100)
@@ -97,8 +97,7 @@ public class WebPageReviewExtractor extends WebPageExtractor {
continue;
item =
item.replace("<span>","").replace("</span>","").replace("<b>","").replace("</b>","");
if (item.length()>80 &&
MinedSentenceProcessor.acceptableMinedSentence(item)==null){
- // TODO OPENNLP-1454 Candidate for
logger.debug(...) if required/helpful
- // System.out.println("Rejected
sentence by GeneratedSentenceProcessor.acceptableMinedSentence = "+item);
+ LOG.debug("Rejected sentence by
GeneratedSentenceProcessor.acceptableMinedSentence = {}", item);
continue;
}
productFeaturesList .add(item);
@@ -109,7 +108,7 @@ public class WebPageReviewExtractor extends
WebPageExtractor {
String startArea = StringUtils.substringBetween(pageOrigHTML,
"reviewHistoPop", "t of 5 stars");
String item = StringUtils.substringBetween(startArea,
"title=\"","ou" );
- if (item==null){//title="4.0 out of 5 stars" ><span>4.0 out of
5 stars</span>
+ if (item==null) { //title="4.0 out of 5 stars" ><span>4.0 out
of 5 stars</span>
int index = pageOrigHTML.indexOf("of 5 stars\"");
startArea = StringUtils.substringBetween(pageOrigHTML,
"of 5 stars\"", "of 5 stars");
item = StringUtils.substringBetween(startArea,
"<span>","ou" );
@@ -121,7 +120,7 @@ public class WebPageReviewExtractor extends
WebPageExtractor {
float rating = Float.parseFloat(item);
reviewObj.setRating(rating);
} catch (NumberFormatException e) {
- e.printStackTrace();
+ LOG.error(e.getLocalizedMessage(), e);
}
}
//productFeaturesList .add(item);
@@ -130,7 +129,7 @@ public class WebPageReviewExtractor extends
WebPageExtractor {
downloadedPage = downloadedPage.replaceAll("(?:&)+", "#");
String[] sents = downloadedPage.split("#");
List<TextChunk> sentsList = new ArrayList<>();
- for(String s: sents){
+ for(String s: sents) {
s = s.trim().replace(" ", ". ").replace("..",
".").replace(". . .", " ")
.replace(": ", ". ").replace("- ", ".
").
replace (". .",".").trim();
@@ -140,7 +139,7 @@ public class WebPageReviewExtractor extends
WebPageExtractor {
sentsList.sort(new TextChunkComparable());
String[] longestSents = new String[maxSentsFromPage];
int j=0;
// -1 removed
- for(int i=sentsList.size()-1 -maxSentsFromPage; i<
sentsList.size()&& j<longestSents.length; i++){
+ for (int i=sentsList.size()-1 -maxSentsFromPage; i<
sentsList.size()&& j<longestSents.length; i++) {
longestSents[j] = sentsList.get(i).text;
j++;
}
@@ -156,7 +155,7 @@ public class WebPageReviewExtractor extends
WebPageExtractor {
}
private String[] verifyEnforceStartsUpperCase(String[] sents) {
- for(int i=0; i<sents.length; i++){
+ for (int i=0; i<sents.length; i++) {
String s = sents[i];
s = StringUtils.trim(s);
String sFirstChar = s.substring(0, 1);
@@ -170,7 +169,7 @@ public class WebPageReviewExtractor extends
WebPageExtractor {
private List<String> cleanProductFeatures(List<String>
productFeaturesList) {
List<String> results = new ArrayList<>();
- for(String feature: productFeaturesList){
+ for (String feature: productFeaturesList) {
if (feature.startsWith("Unlimited Free") ||
feature.startsWith("View Larger") || feature.startsWith("View Larger") ||
feature.indexOf("shipping")>0)
continue;
results.add(feature);
@@ -178,8 +177,7 @@ public class WebPageReviewExtractor extends
WebPageExtractor {
return results;
}
- protected String[] cleanListOfSents(String[] longestSents)
- {
+ protected String[] cleanListOfSents(String[] longestSents) {
float minFragmentLength = 40, minFragmentLengthSpace=4;
List<String> sentsClean = new ArrayList<>();
@@ -205,7 +203,7 @@ public class WebPageReviewExtractor extends
WebPageExtractor {
// disused - Feb 26 13
//furtherSplit =
furtherMakeSentencesShorter(furtherSplit);
furtherSplit.remove(furtherSplit.size()-1);
- for(String s : furtherSplit){
+ for (String s : furtherSplit) {
if (s.indexOf('|')>-1)
continue;
s = s.replace("<em>"," ").replace("</em>"," ");
@@ -243,7 +241,7 @@ public class WebPageReviewExtractor extends
WebPageExtractor {
}
} catch (Exception e) {
results.add(sent);
- e.printStackTrace();
+ LOG.error(e.getLocalizedMessage(), e);
}
}
@@ -282,22 +280,23 @@ public class WebPageReviewExtractor extends
WebPageExtractor {
reviewObjTotal.setSentimentPhrases(buf);
}
- /* buf = reviewObjTotal.getOriginalizedSentences();
- if (buf!=null && afterOriginalization!=null &&
afterOriginalization.length>0){
- List<String> b1 =
Arrays.asList(afterOriginalization);
- List<String> b2 = new
ArrayList<String>();
- b2.addAll(buf);
- b2.addAll(new ArrayList<String>(b1));
-
reviewObjTotal.setOriginalizedSentences(b2);
- }
-*/
+ /*
+ buf = reviewObjTotal.getOriginalizedSentences();
+ if (buf!=null &&
afterOriginalization!=null && afterOriginalization.length>0){
+ List<String> b1 =
Arrays.asList(afterOriginalization);
+ List<String> b2 = new
ArrayList<String>();
+ b2.addAll(buf);
+ b2.addAll(new
ArrayList<String>(b1));
+
reviewObjTotal.setOriginalizedSentences(b2);
+ }
+ */
}
if (reviewObjTotal==null) return new ArrayList<>();
List<String> textReviews =
buildManyReviewTexts(reviewObjTotal);
-
- /* String textReview = buildText(reviewObjTotal);
+ /*
+ String textReview = buildText(reviewObjTotal);
try {
if (textReview!=null && textReview.length()>60)
ser.saveReviewsToDB(textReview, bpid,
pagesForAProduct.get(0).getUrl(), pagesForAProduct.get(0).getTitle(),
@@ -308,7 +307,7 @@ public class WebPageReviewExtractor extends
WebPageExtractor {
*/
} catch (Exception e) {
- e.printStackTrace();
+ LOG.error(e.getLocalizedMessage(), e);
}
return reviewObjTotal.getOriginalizedSentences();
}
@@ -374,7 +373,7 @@ public class WebPageReviewExtractor extends
WebPageExtractor {
if
(bufs[currentRevIndex].toString().split(".").length>4)
bufs[currentRevIndex].append("\n");
} catch (Exception e) {
- e.printStackTrace();
+ LOG.error(e.getLocalizedMessage(), e);
}
count++;
diff --git
a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/apps/SnippetToParagraph.java
b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/apps/SnippetToParagraph.java
index 81a6f40..d01fec2 100644
---
a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/apps/SnippetToParagraph.java
+++
b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/apps/SnippetToParagraph.java
@@ -16,15 +16,14 @@
*/
package opennlp.tools.parse_thicket.apps;
+import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.List;
-import java.util.logging.Logger;
import org.apache.commons.lang.StringUtils;
-
import opennlp.tools.similarity.apps.ContentGeneratorSupport;
import opennlp.tools.similarity.apps.Fragment;
import opennlp.tools.similarity.apps.GeneratedSentenceProcessor;
@@ -33,12 +32,12 @@ import opennlp.tools.similarity.apps.utils.PageFetcher;
import opennlp.tools.similarity.apps.utils.StringDistanceMeasurer;
import opennlp.tools.similarity.apps.utils.Utils;
import opennlp.tools.textsimilarity.TextProcessor;
-
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
public class SnippetToParagraph extends ContentGeneratorSupport
/*RelatedSentenceFinder */{
+ private static final Logger LOG =
LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
private final PageFetcher pFetcher = new PageFetcher();
- private static final Logger LOG = Logger
-
.getLogger("com.become.parse_thicket.apps.SnippetToParagraph");
public HitBase formTextFromOriginalPageGivenSnippetDirect(HitBase item)
{
@@ -102,7 +101,7 @@ public class SnippetToParagraph extends
ContentGeneratorSupport /*RelatedSentenc
followSent = mainAndFollowSent[1];
} catch (Exception e) {
- e.printStackTrace();
+ LOG.error(e.getLocalizedMessage(), e);
}
else
// or get original snippet
@@ -124,13 +123,10 @@ public class SnippetToParagraph extends
ContentGeneratorSupport /*RelatedSentenc
f.setSourceURL(item.getUrl());
f.fragment = fragment;
result.add(f);
- System.out.println("Accepted sentence: " +
pageSentenceProc
- + "| with title= " + title);
- System.out.println("For fragment = " +
fragment);
+ LOG.debug("Accepted sentence: {} | with title =
{}", pageSentenceProc, title);
+ LOG.debug("For fragment = {}", fragment);
} else
- System.out
- .println("Rejected sentence due to wrong area
at webpage: "
- + pageSentence);
+ LOG.debug("Rejected sentence due to wrong area
at webpage: {}", pageSentence);
}
@@ -166,21 +162,19 @@ public class SnippetToParagraph extends
ContentGeneratorSupport /*RelatedSentenc
// try to find original sentence from webpage
try {
- String[] mainAndFollowSent =
getFullOriginalSentenceFromWebpageBySnippetFragment(
- f, sents);
+ String[] mainAndFollowSent =
getFullOriginalSentenceFromWebpageBySnippetFragment(f, sents);
pageSentence = mainAndFollowSent[0];
followSent = mainAndFollowSent[1];
if (pageSentence!=null)
result.add(pageSentence);
else {
result.add(f);
- LOG.info("Could not find the original
sentence \n"+f +"\n in the page " );
+ LOG.warn("Could not find the original
sentence \n {} \n in the page ", f);
}
//if (followSent !=null)
// result.add(followSent);
} catch (Exception e) {
-
- e.printStackTrace();
+ LOG.error(e.getLocalizedMessage(), e);
}
}
item.setOriginalSentences(result);
@@ -197,25 +191,19 @@ public class SnippetToParagraph extends
ContentGeneratorSupport /*RelatedSentenc
return sentsClean;
}
-
-
- private String[] removeDuplicates(String[] hits)
- {
+ private String[] removeDuplicates(String[] hits) {
StringDistanceMeasurer meas = new StringDistanceMeasurer();
List<Integer> idsToRemove = new ArrayList<>();
List<String> hitsDedup = new ArrayList<>();
- try
- {
+ try {
for (int i = 0; i < hits.length; i++)
- for (int j = i + 1; j < hits.length; j++)
- {
+ for (int j = i + 1; j < hits.length; j++) {
String title1 = hits[i];
String title2 = hits[j];
if (StringUtils.isEmpty(title1) ||
StringUtils.isEmpty(title2))
continue;
- if (meas.measureStringDistance(title1,
title2) > 0.7)
- {
+ if (meas.measureStringDistance(title1,
title2) > 0.7) {
idsToRemove.add(j); // dupes
found, later list member to
// be deleted
}
@@ -223,14 +211,12 @@ public class SnippetToParagraph extends
ContentGeneratorSupport /*RelatedSentenc
for (int i = 0; i < hits.length; i++)
if (!idsToRemove.contains(i))
hitsDedup.add(hits[i]);
- if (hitsDedup.size() < hits.length)
- {
+ if (hitsDedup.size() < hits.length) {
System.out.println("Removed duplicates from
relevant search results, including "
+ hits[idsToRemove.get(0)]);
}
}
- catch (Exception e)
- {
+ catch (Exception e) {
System.out.println("Problem removing duplicates from
relevant images");
}
@@ -238,15 +224,13 @@ public class SnippetToParagraph extends
ContentGeneratorSupport /*RelatedSentenc
}
- public String[] extractSentencesFromPage(String url)
- {
+ public String[] extractSentencesFromPage(String url) {
int maxSentsFromPage= 100;
List<String[]> results = new ArrayList<>();
String downloadedPage = pFetcher.fetchPage(url, 20000);
- if (downloadedPage == null || downloadedPage.length() < 100)
- {
+ if (downloadedPage == null || downloadedPage.length() < 100) {
return null;
}
@@ -286,13 +270,11 @@ public class SnippetToParagraph extends
ContentGeneratorSupport /*RelatedSentenc
float minFragmentLength = 40, minFragmentLengthSpace=4;
List<String> sentsClean = new ArrayList<>();
- for (String sentenceOrMultSent : longestSents)
- {
+ for (String sentenceOrMultSent : longestSents) {
if (sentenceOrMultSent==null || sentenceOrMultSent.length()<20)
continue;
if
(GeneratedSentenceProcessor.acceptableMinedSentence(sentenceOrMultSent)==null){
- // TODO OPENNLP-1454 Candidate for logger.debug(...) if
required/helpful
- // System.out.println("Rejected sentence by
GeneratedSentenceProcessor.acceptableMinedSentence = "+sentenceOrMultSent);
+ LOG.debug("Rejected sentence by
GeneratedSentenceProcessor.acceptableMinedSentence = {}", sentenceOrMultSent);
continue;
}
// aaa. hhh hhh. kkk . kkk ll hhh. lll kkk n.
@@ -312,8 +294,8 @@ public class SnippetToParagraph extends
ContentGeneratorSupport /*RelatedSentenc
// disused - Feb 26 13
//furtherSplit = furtherMakeSentencesShorter(furtherSplit);
furtherSplit.remove(furtherSplit.size()-1);
- for(String s : furtherSplit){
- if (s.indexOf('|')>-1)
+ for(String s : furtherSplit) {
+ if (s.indexOf('|') >- 1)
continue;
s = s.replace("<em>"," ").replace("</em>"," ");
s = Utils.convertToASCII(s);
@@ -324,7 +306,7 @@ public class SnippetToParagraph extends
ContentGeneratorSupport /*RelatedSentenc
return sentsClean.toArray(new String[0]);
}
private String[] verifyEnforceStartsUpperCase(String[] sents) {
- for(int i=0; i<sents.length; i++){
+ for(int i=0; i<sents.length; i++) {
String s = sents[i];
s = StringUtils.trim(s);
String sFirstChar = s.substring(0, 1);
diff --git
a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/apps/WebPageExtractor.java
b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/apps/WebPageExtractor.java
index 1f1844a..94960e5 100644
---
a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/apps/WebPageExtractor.java
+++
b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/apps/WebPageExtractor.java
@@ -16,6 +16,7 @@
*/
package opennlp.tools.parse_thicket.apps;
+import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
@@ -26,18 +27,20 @@ import org.apache.commons.lang.StringUtils;
import opennlp.tools.similarity.apps.GeneratedSentenceProcessor;
import opennlp.tools.similarity.apps.utils.PageFetcher;
import opennlp.tools.textsimilarity.TextProcessor;
-import
opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
public class WebPageExtractor {
+
+ private static final Logger LOG =
LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+
protected final PageFetcher pageFetcher = new PageFetcher();
- protected ParserChunker2MatcherProcessor nlProc;
protected final MostFrequentWordsFromPageGetter
mostFrequentWordsFromPageGetter = new MostFrequentWordsFromPageGetter();
protected static final int SENT_THRESHOLD_LENGTH = 70;
- public List<String[]>
extractSentencesWithPotentialProductKeywords(String url)
- {
+ public List<String[]>
extractSentencesWithPotentialProductKeywords(String url) {
int maxSentsFromPage= 20;
List<String[]> results = new ArrayList<>();
@@ -60,13 +63,11 @@ public class WebPageExtractor {
if (headerSections.length<2)
headerSections = pageOrigHTML.split("<h3");
for(String section: headerSections){
-
String header = StringUtils.substringBetween(section,
">", "<");
if (header!=null && header.length()>20)
pageTitles.add(header);
}
-
downloadedPage= downloadedPage.replace(" ", "&");
downloadedPage = downloadedPage.replaceAll("(?:&)+", "#");
String[] sents = downloadedPage.split("#");
@@ -77,10 +78,8 @@ public class WebPageExtractor {
replace (". .",".").trim();
sentsList.add(new TextChunk(s, s.length()));
}
-
sentsList.sort(new TextChunkComparable());
-
String[] longestSents = new String[maxSentsFromPage];
int j=0;
for(int i=sentsList.size() -maxSentsFromPage; i<
sentsList.size(); i++){
@@ -100,22 +99,19 @@ public class WebPageExtractor {
return results;
}
- protected String[] cleanListOfSents(String[] longestSents)
- {
+ protected String[] cleanListOfSents(String[] longestSents) {
List<String> sentsClean = new ArrayList<>();
- for (String sentenceOrMultSent : longestSents)
- {
+ for (String sentenceOrMultSent : longestSents) {
List<String> furtherSplit =
TextProcessor.splitToSentences(sentenceOrMultSent);
- for(String s : furtherSplit){
+ for(String s : furtherSplit) {
if (s.replace('.','&').split("&").length>3)
continue;
if (s.indexOf('|')>-1)
continue;
if (s == null || s.trim().length() <
SENT_THRESHOLD_LENGTH || s.length() < SENT_THRESHOLD_LENGTH + 10)
continue;
- if
(GeneratedSentenceProcessor.acceptableMinedSentence(s)==null){
- // TODO OPENNLP-1454 Candidate for
logger.debug(...) if required/helpful
- // System.out.println("Rejected
sentence by GeneratedSentenceProcessor.acceptableMinedSentence = "+s);
+ if
(GeneratedSentenceProcessor.acceptableMinedSentence(s)==null) {
+ LOG.debug("Rejected sentence by
GeneratedSentenceProcessor.acceptableMinedSentence = {}", s);
continue;
}
sentsClean.add(s);
@@ -144,10 +140,9 @@ public class WebPageExtractor {
public static void main(String[] args){
WebPageExtractor extractor = new WebPageExtractor();
- List<String[]> res =
-
extractor.extractSentencesWithPotentialProductKeywords("http://www.sitbetter.com/view/chair/ofm-500-l/ofm--high-back-leather-office-chair/");
- System.out.println(res.get(1));
-
+ List<String[]> res =
extractor.extractSentencesWithPotentialProductKeywords(
+
"http://www.sitbetter.com/view/chair/ofm-500-l/ofm--high-back-leather-office-chair/");
+ LOG.info(Arrays.toString(res.get(1)));
}
}
diff --git
a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/pattern_structure/LinguisticPatternStructure.java
b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/pattern_structure/LinguisticPatternStructure.java
index ff6c412..ebf44fb 100755
---
a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/pattern_structure/LinguisticPatternStructure.java
+++
b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/pattern_structure/LinguisticPatternStructure.java
@@ -16,6 +16,7 @@
*/
package opennlp.tools.parse_thicket.pattern_structure;
+import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
@@ -23,17 +24,18 @@ import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
-import opennlp.tools.fca.ConceptLattice;
import opennlp.tools.textsimilarity.ParseTreeChunk;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
public class LinguisticPatternStructure extends PhrasePatternStructure {
+ private static final Logger LOG =
LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+
public LinguisticPatternStructure(int objectCounts, int
attributeCounts) {
super(objectCounts, attributeCounts);
-
- ConceptLattice cl = null;
}
-
+
public void AddExtentToAncestors(LinkedHashSet<Integer>extent, int
curNode) {
//
if (conceptList.get(curNode).parents.size()>0){
@@ -41,20 +43,19 @@ public class LinguisticPatternStructure extends
PhrasePatternStructure {
conceptList.get(parent).addExtents(extent);
AddExtentToAncestors(extent, parent);
}
- }
+ }
}
-
+
public int AddIntent(List<List<ParseTreeChunk>> intent,
LinkedHashSet<Integer>extent,int generator) {
- // TODO OPENNLP-1454 Candidate for logger.debug(...) if
required/helpful
- // System.out.println("debug called for " + intent);
+ LOG.debug("debug called for {}", intent);
//printLattice();
int generator_tmp = GetMaximalConcept(intent, generator);
generator = generator_tmp;
if (conceptList.get(generator).intent.equals(intent)) {
- System.out.println("at generator:" +
conceptList.get(generator).intent);
- System.out.println("to add:" + intent);
- System.out.println("already generated");
- AddExtentToAncestors(extent, generator);
+ LOG.debug("at generator: {}",
conceptList.get(generator).intent);
+ LOG.debug("to add: {}", intent);
+ LOG.debug("already generated");
+ AddExtentToAncestors(extent, generator);
return generator;
}
Set<Integer> generatorParents =
conceptList.get(generator).parents;
@@ -67,13 +68,12 @@ public class LinguisticPatternStructure extends
PhrasePatternStructure {
new_extent.addAll(conceptList.get(candidate).extent);
new_extent.addAll(extent);
if (intent.size()!=intersection.size()){
- // TODO OPENNLP-1454 Candidate for
logger.debug(...) if required/helpful
- // System.out.println("recursive call
(inclusion)");
- // System.out.println(intent + "----" +
intersection);
+ LOG.debug("recursive call (inclusion)");
+ LOG.debug("{}----{}", intent,
intersection);
candidate =
AddIntent(intersection,new_extent, candidate);
}
}
-
+
boolean addParents = true;
// System.out.println("now iterating over parents");
Iterator<Integer> iterator = newParents.iterator();
@@ -93,8 +93,7 @@ public class LinguisticPatternStructure extends
PhrasePatternStructure {
newParents.add(candidate);
}
}
- // TODO OPENNLP-1454 Candidate for logger.debug(...) if
required/helpful
- // System.out.println("size of lattice: " + conceptList.size());
+ LOG.debug("size of lattice: {}", conceptList.size());
PhraseConcept newConcept = new PhraseConcept();
newConcept.setIntent(intent);
@@ -102,7 +101,7 @@ public class LinguisticPatternStructure extends
PhrasePatternStructure {
new_extent.addAll(conceptList.get(generator).extent);
new_extent.addAll(extent);
newConcept.addExtents(new_extent);
-
+
newConcept.setPosition(conceptList.size());
conceptList.add(newConcept);
conceptList.get(generator).parents.add(newConcept.position);
@@ -119,43 +118,42 @@ public class LinguisticPatternStructure extends
PhrasePatternStructure {
}
return newConcept.position;
}
-
+
public void printLatticeExtended() {
for (int i = 0; i < conceptList.size(); ++i) {
printConceptByPositionExtended(i);
}
}
-
+
public void printConceptByPositionExtended(int index) {
- System.out.println("Concept at position " + index);
+ LOG.debug("Concept at position {}", index);
conceptList.get(index).printConceptExtended();
}
-
-
- public int [][] toContext(int extentCardinality){
-
+
+ public int [][] toContext(int extentCardinality) {
+
int newAttrCount = conceptList.size();
ArrayList<PhraseConcept> cList = new ArrayList<>(conceptList);
boolean run = true;
- int k=0;
- while (run && k<conceptList.size()){
- if (conceptList.get(k).intent.size() == attributeCount){
+ int k = 0;
+ while (run && k<conceptList.size()) {
+ if (conceptList.get(k).intent.size() == attributeCount)
{
if (conceptList.get(k).extent.size() == 0)
for (Integer
i:conceptList.get(k).parents)
cList.remove(i);
cList.remove(k);
- run=false;
+ run = false;
}
else
- k+=1;
+ k+=1;
}
-
+
run = true;
k=0;
while (run && k<=newAttrCount){
if (cList.get(k).extent.size()==0)
k++;
- run = false;
+ run = false;
}
newAttrCount = cList.size();
Set<Integer> nodeExtend;
@@ -168,9 +166,7 @@ public class LinguisticPatternStructure extends
PhrasePatternStructure {
}
return binaryContext;
}
-
-
-
+
public void logStability(){
int min_delta, delta;
float sum;
diff --git
a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/pattern_structure/PhrasePatternStructure.java
b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/pattern_structure/PhrasePatternStructure.java
index 86d9523..d910352 100644
---
a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/pattern_structure/PhrasePatternStructure.java
+++
b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/pattern_structure/PhrasePatternStructure.java
@@ -16,6 +16,7 @@
*/
package opennlp.tools.parse_thicket.pattern_structure;
+import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
@@ -25,8 +26,12 @@ import java.util.Set;
import opennlp.tools.parse_thicket.ParseTreeNode;
import opennlp.tools.textsimilarity.ParseTreeChunk;
import opennlp.tools.textsimilarity.ParseTreeMatcherDeterministic;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
public class PhrasePatternStructure {
+ private static final Logger LOG =
LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+
final int objectCount;
final int attributeCount;
public final List<PhraseConcept> conceptList;
@@ -59,16 +64,16 @@ public class PhrasePatternStructure {
}
return Generator;
}
+
public int AddIntent(List<List<ParseTreeChunk>> intent, int generator) {
- // TODO OPENNLP-1454 Candidate for logger.debug(...) if
required/helpful
- // System.out.println("debug called for " + intent);
+ LOG.debug("debug called for {}", intent);
//printLattice();
int generator_tmp = GetMaximalConcept(intent, generator);
generator = generator_tmp;
if (conceptList.get(generator).intent.equals(intent)) {
- System.out.println("at generator:" +
conceptList.get(generator).intent);
- System.out.println("to add:" + intent);
- System.out.println("already generated");
+ LOG.debug("at generator: {}",
conceptList.get(generator).intent);
+ LOG.debug("to add: {}", intent);
+ LOG.debug("already generated");
return generator;
}
Set<Integer> generatorParents =
conceptList.get(generator).parents;
@@ -81,16 +86,14 @@ public class PhrasePatternStructure {
//intersection.retainAll(intent);
List<List<ParseTreeChunk>> intersection = md
.matchTwoSentencesGroupedChunksDeterministic(intent,
conceptList.get(candidate).intent);
- // TODO OPENNLP-1454 Candidate for
logger.debug(...) if required/helpful
- // System.out.println("recursive call
(inclusion)");
+ LOG.debug("recursive call (inclusion)");
candidate = AddIntent(intersection, candidate);
}
boolean addParents = true;
- // TODO OPENNLP-1454 Candidate for logger.debug(...) if
required/helpful
- // System.out.println("now iterating over parents");
+ LOG.debug("now iterating over parents");
Iterator<Integer> iterator = newParents.iterator();
while (iterator.hasNext()) {
- Integer parent = iterator.next();
+ int parent = iterator.next();
if
(conceptList.get(parent).intent.containsAll(conceptList.get(candidate).intent))
{
addParents = false;
break;
@@ -120,8 +123,7 @@ public class PhrasePatternStructure {
newParents.add(candidate);
}
}
- // TODO OPENNLP-1454 Candidate for logger.debug(...) if
required/helpful
- // System.out.println("size of lattice: " + conceptList.size());
+ LOG.debug("size of lattice: {}", conceptList.size());
PhraseConcept newConcept = new PhraseConcept();
newConcept.setIntent(intent);
newConcept.setPosition(conceptList.size());
@@ -137,10 +139,10 @@ public class PhrasePatternStructure {
}
public void printLatticeStats() {
- System.out.println("Lattice stats");
- System.out.println("max_object_index = " + objectCount);
- System.out.println("max_attribute_index = " + attributeCount);
- System.out.println("Current concept count = " +
conceptList.size());
+ LOG.info("Lattice stats:");
+ LOG.info("max_object_index = {}", objectCount);
+ LOG.info("max_attribute_index = {}", attributeCount);
+ LOG.info("Current concept count = {}", conceptList.size());
}
@@ -151,19 +153,18 @@ public class PhrasePatternStructure {
}
public void printConceptByPosition(int index) {
- System.out.println("Concept at position " + index);
+ LOG.debug("Concept at position {}", index);
conceptList.get(index).printConcept();
}
public List<List<ParseTreeChunk>> formGroupedPhrasesFromChunksForPara(
List<List<ParseTreeNode>> phrs) {
List<List<ParseTreeChunk>> results = new ArrayList<>();
- List<ParseTreeChunk> nps = new ArrayList<>(), vps = new
ArrayList<>(),
- pps = new ArrayList<>();
- for(List<ParseTreeNode> ps:phrs){
+ List<ParseTreeChunk> nps = new ArrayList<>(), vps = new
ArrayList<>(), pps = new ArrayList<>();
+ for(List<ParseTreeNode> ps:phrs) {
ParseTreeChunk ch = convertNodeListIntoChunk(ps);
String ptype = ps.get(0).getPhraseType();
- System.out.println(ps);
+ LOG.debug(ps.toString());
if (ptype.equals("NP")){
nps.add(ch);
} else if (ptype.equals("VP")){
@@ -177,8 +178,8 @@ public class PhrasePatternStructure {
}
private ParseTreeChunk convertNodeListIntoChunk(List<ParseTreeNode> ps)
{
- List<String> lemmas = new ArrayList<>(), poss = new
ArrayList<>();
- for(ParseTreeNode n: ps){
+ List<String> lemmas = new ArrayList<>(), poss = new
ArrayList<>();
+ for(ParseTreeNode n: ps) {
lemmas.add(n.getWord());
poss.add(n.getPos());
}
@@ -187,6 +188,5 @@ public class PhrasePatternStructure {
return ch;
}
-
}
diff --git
a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/ContentGenerator.java
b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/ContentGenerator.java
index 00a6d33..33f387b 100644
---
a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/ContentGenerator.java
+++
b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/ContentGenerator.java
@@ -17,6 +17,7 @@
package opennlp.tools.similarity.apps;
+import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
@@ -29,14 +30,18 @@ import opennlp.tools.textsimilarity.ParseTreeChunk;
import opennlp.tools.textsimilarity.ParseTreeChunkListScorer;
import opennlp.tools.textsimilarity.SentencePairMatchResult;
import
opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
/**
- * This class does content generation by using web mining and syntactic
generalization to get sentences
+ * Generates content by using web mining and syntactic generalization to get
sentences
* from the web, convert and combine them in the form expected to be readable
by humans and
* not distinguishable from genuine content by search engines.
*/
public class ContentGenerator /*extends RelatedSentenceFinder*/ {
+ private static final Logger LOG =
LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+
private final PageFetcher pFetcher = new PageFetcher();
private final ParserChunker2MatcherProcessor sm =
ParserChunker2MatcherProcessor.getInstance();
private final ParseTreeChunkListScorer parseTreeChunkListScorer = new
ParseTreeChunkListScorer();
@@ -250,7 +255,7 @@ public class ContentGenerator /*extends
RelatedSentenceFinder*/ {
mainAndFollowSent =
ContentGeneratorSupport.getFullOriginalSentenceFromWebpageBySnippetFragment(
fragment.replace("_should_find_orig_", ""), sentsSortedByLength);
} catch (Exception e) {
- e.printStackTrace();
+ LOG.error(e.getLocalizedMessage(), e);
}
// if the above gives null than try to match
all sentences from snippet fragment
if (mainAndFollowSent==null ||
mainAndFollowSent[0]==null){
@@ -259,7 +264,7 @@ public class ContentGenerator /*extends
RelatedSentenceFinder*/ {
}
} catch (Exception e) {
- e.printStackTrace();
+ LOG.error(e.getLocalizedMessage(), e);
}
}
else
@@ -302,8 +307,7 @@ public class ContentGenerator /*extends
RelatedSentenceFinder*/ {
}
syntScore =
parseTreeChunkListScorer.getParseTreeChunkListScore(match);
- System.out.println(parseTreeChunk.listToString(match) +
" "
- + syntScore + "\n pre-processed sent =
'" + pageSentence);
+ LOG.debug("{} {}\n pre-processed sent = '{}'",
parseTreeChunk.listToString(match), syntScore, pageSentence);
if (syntScore < RELEVANCE_THRESHOLD){ // 1.5) { //
trying other sents
for (String currSent : sentsAll) {
@@ -339,21 +343,19 @@ public class ContentGenerator /*extends
RelatedSentenceFinder*/ {
pageSentenceProc =
Utils.convertToASCII(pageSentenceProc);
result = new Fragment(pageSentenceProc,
syntScore + measScore
- + mentalScore +
(double) pageSentenceProc.length()
- / (double) 50);
+ + mentalScore +
(double) pageSentenceProc.length() / (double) 50);
result.setSourceURL(item.getUrl());
result.fragment = fragment;
- System.out.println("Accepted sentence:
" + pageSentenceProc
- + "| with title= " +
title);
- System.out.println("For fragment = " +
fragment);
+ LOG.debug("Accepted sentence: {} |
with title = {}", pageSentenceProc, title);
+ LOG.debug("For fragment = {}",
fragment);
} else
- System.out.println("Rejected sentence
due to wrong area at webpage: " + pageSentence);
+ LOG.debug("Rejected sentence due to
wrong area at webpage: {}", pageSentence);
} else
- System.out.println("Rejected sentence due to
low score: " + pageSentence);
+ LOG.debug("Rejected sentence due to low score:
{}", pageSentence);
// }
} catch (Throwable t) {
- t.printStackTrace();
+ LOG.error(t.getLocalizedMessage(), t);
}
return result;
}
@@ -417,7 +419,7 @@ public class ContentGenerator /*extends
RelatedSentenceFinder*/ {
// hits.get(0).getTitle(), hits);
} catch (Exception e) {
- e.printStackTrace();
+ LOG.error(e.getLocalizedMessage(), e);
}
}
diff --git
a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/ContentGeneratorSupport.java
b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/ContentGeneratorSupport.java
index 0575bbd..b846abf 100644
---
a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/ContentGeneratorSupport.java
+++
b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/ContentGeneratorSupport.java
@@ -17,10 +17,10 @@
package opennlp.tools.similarity.apps;
+import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
-import java.util.logging.Logger;
import java.util.regex.Pattern;
import opennlp.tools.similarity.apps.utils.StringDistanceMeasurer;
@@ -30,13 +30,14 @@ import opennlp.tools.textsimilarity.TextProcessor;
import
opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor;
import org.apache.commons.lang.StringUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
/**
* This class supports content generation by static functions.
*/
public class ContentGeneratorSupport {
- private static final Logger LOG = Logger
-
.getLogger("opennlp.tools.similarity.apps.ContentGeneratorSupport");
+ private static final Logger LOG =
LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
//TODO - verify regexp!!
private static final Pattern SPACES_PATTERN =
Pattern.compile("([a-z])(\\s{2,3})([A-Z])");
@@ -165,9 +166,7 @@ public class ContentGeneratorSupport {
if (StringUtils.isEmpty(title1) ||
StringUtils.isEmpty(title2))
continue;
if (meas.measureStringDistance(title1,
title2) > dupeThresh) {
- idsToRemove.add(j); // dupes
found, later list member to
- // be deleted
-
+ idsToRemove.add(j); // dupes
found, later list member to be deleted
}
}
@@ -176,12 +175,11 @@ public class ContentGeneratorSupport {
hitsDedup.add(hits.get(i));
if (hitsDedup.size() < hits.size()) {
- LOG.info("Removed duplicates from formed query,
including "
- + hits.get(idsToRemove.get(0)));
+ LOG.info("Removed duplicates from formed query,
including {}", hits.get(idsToRemove.get(0)));
}
} catch (Exception e) {
- LOG.severe("Problem removing duplicates from query
list");
+ LOG.error("Problem removing duplicates from query
list", e);
}
return hitsDedup;
@@ -213,7 +211,7 @@ public class ContentGeneratorSupport {
continue;
if
(meas.measureStringDistance(sf1, sf2) > dupeThresh) {
fragmList2Results.remove(f2);
-
LOG.info("Removed duplicates from formed fragments list: " + sf2);
+
LOG.info("Removed duplicates from formed fragments list: {}", sf2);
}
}
@@ -221,7 +219,7 @@ public class ContentGeneratorSupport {
hits.set(j, hit2);
}
} catch (Exception e) {
- LOG.severe("Problem removing duplicates from list of
fragment");
+ LOG.error("Problem removing duplicates from list of
fragment", e);
}
return hits;
}
@@ -236,13 +234,13 @@ public class ContentGeneratorSupport {
return null;
StringDistanceMeasurer meas = new StringDistanceMeasurer();
- Double dist = 0.0;
+ double dist = 0.0;
String result = null, followSent = "";
for (int i = 0; i < sents.length; i++) {
String s = sents[i];
if (s == null || s.length() < 30)
continue;
- Double distCurr = meas.measureStringDistance(s,
fragment);
+ double distCurr = meas.measureStringDistance(s,
fragment);
if (distCurr > dist && distCurr > 0.4) {
result = s;
dist = distCurr;
@@ -261,7 +259,7 @@ public class ContentGeneratorSupport {
}
}
} catch (Exception e) {
- e.printStackTrace();
+ LOG.error(e.getLocalizedMessage(), e);
}
}
}
@@ -276,13 +274,13 @@ public class ContentGeneratorSupport {
return null;
int bestSentIndex = -1;
StringDistanceMeasurer meas = new StringDistanceMeasurer();
- Double distBest = 10.0; // + sup
+ double distBest = 10.0; // + sup
String result = null, followSent = null;
for (int i = 0; i < sents.length; i++) {
String s = sents[i];
if (s == null || s.length() < 30)
continue;
- Double distCurr = meas.measureStringDistance(s,
fragment);
+ double distCurr = meas.measureStringDistance(s,
fragment);
if (distCurr > distBest) {
distBest = distCurr;
bestSentIndex = i;
@@ -314,7 +312,7 @@ public class ContentGeneratorSupport {
downloadedPage = downloadedPage.replaceAll("(?:&)+", "#");
String[] sents = downloadedPage.split("#");
List<TextChunk> sentsList = new ArrayList<>();
- for(String s: sents){
+ for (String s: sents) {
s =
ContentGeneratorSupport.cleanSpacesInCleanedHTMLpage(s);
sentsList.add(new TextChunk(s, s.length()));
}
@@ -325,7 +323,7 @@ public class ContentGeneratorSupport {
int initIndex = sentsList.size()-1 -maxSentsFromPage;
if (initIndex<0)
initIndex = 0;
- for(int i=initIndex; i< sentsList.size() && j<maxSentsFromPage
; i++){
+ for (int i=initIndex; i< sentsList.size() && j<maxSentsFromPage
; i++) {
longestSents[j] = sentsList.get(i).text;
j++;
}
@@ -359,13 +357,12 @@ public class ContentGeneratorSupport {
float minFragmentLength = 40, minFragmentLengthSpace=4;
List<String> sentsClean = new ArrayList<>();
- for (String sentenceOrMultSent : longestSents)
- {
+ for (String sentenceOrMultSent : longestSents) {
if (sentenceOrMultSent==null ||
sentenceOrMultSent.length()<20)
continue;
- if
(GeneratedSentenceProcessor.acceptableMinedSentence(sentenceOrMultSent)==null){
- // TODO OPENNLP-1454 Candidate for
logger.debug(...) if required/helpful
- // System.out.println("Rejected sentence by
GeneratedSentenceProcessor.acceptableMinedSentence = "+sentenceOrMultSent);
+ if
(GeneratedSentenceProcessor.acceptableMinedSentence(sentenceOrMultSent)==null) {
+ LOG.debug("Rejected sentence by
GeneratedSentenceProcessor.acceptableMinedSentence = {}",
+
sentenceOrMultSent);
continue;
}
// aaa. hhh hhh. kkk . kkk ll hhh. lll kkk n.
@@ -385,7 +382,7 @@ public class ContentGeneratorSupport {
// disused - Feb 26 13
//furtherSplit =
furtherMakeSentencesShorter(furtherSplit);
furtherSplit.remove(furtherSplit.size()-1);
- for(String s : furtherSplit){
+ for(String s : furtherSplit) {
if (s.indexOf('|')>-1)
continue;
s = s.replace("<em>"," ").replace("</em>"," ");
@@ -400,12 +397,11 @@ public class ContentGeneratorSupport {
float minFragmentLength = 40, minFragmentLengthSpace=4;
List<String> sentsClean = new ArrayList<>();
- for (String sentenceOrMultSent : longestSents)
- {
+ for (String sentenceOrMultSent : longestSents) {
if (sentenceOrMultSent==null ||
sentenceOrMultSent.length()<minFragmentLength)
continue;
List<String> furtherSplit =
TextProcessor.splitToSentences(sentenceOrMultSent);
- for(String sentence: furtherSplit ){
+ for(String sentence: furtherSplit ) {
if (sentence==null || sentence.length()<20)
continue;
if
(GeneratedSentenceProcessor.acceptableMinedSentence(sentence)==null){
@@ -423,8 +419,6 @@ public class ContentGeneratorSupport {
if (
avgSentenceLengthInTextPortion<minFragmentLengthSpace)
continue;
-
-
// forced split by ',' somewhere in the middle
of sentence
// disused - Feb 26 13
//furtherSplit =
furtherMakeSentencesShorter(furtherSplit);
diff --git
a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/GeneratedSentenceProcessor.java
b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/GeneratedSentenceProcessor.java
index e5384c0..d94c1c2 100644
---
a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/GeneratedSentenceProcessor.java
+++
b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/GeneratedSentenceProcessor.java
@@ -17,64 +17,69 @@
package opennlp.tools.similarity.apps;
+import java.lang.invoke.MethodHandles;
import java.util.Arrays;
import java.util.List;
import org.apache.commons.lang.StringUtils;
import opennlp.tools.similarity.apps.utils.Utils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
public class GeneratedSentenceProcessor {
+ private static final Logger LOG =
LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+
private static final String[] OCCURS = new String[]{ "click here",
"wikipedia", "retrieved", "isbn",
- "http", "www.",
- "copyright", "advertise", "(accessed", "[edit]", "[citation
needed]",
- "site map", "email updates", "contact us", "rss feeds",
"cite this site",
- "operating hours", "last modified", "product catalog",
- "days per week", "leave a comment", "corporate information",
- "employment opportunities", "terms of use", "private policy",
"parental guidelines", "copyright policy", "ad choices",
- "about us", "about our ads", "privacy policy", "terms of
use",
- "click for", "photos",
- "find the latest",
- "terms of service",
- "clicking here",
- "skip to", "sidebar",
- "Tags:",
- "available online",
- "get online",
- "buy online",
- "not valid", "get discount",
- "official site",
- "this video",
- //"this book",
- "this product",
- "paperback", "hardcover",
- "audio cd",
- "related searches",
- "permission is granted",
- "[edit",
- "edit categories",
- "free license",
- "permission is granted",
- "under the terms",
- "rights reserved",
- "wikipedia",
- "recipient of", "this message",
- "mailing list", "purchase order",
- "mon-fri", "email us", "privacy pol", "back to top",
- "click here", "for details", "assistance?", "chat live",
- "free shipping", "company info", "satisfaction g", "contact
us",
- "menu.", "search.", "sign in", "home.",
- "additional terms", "may apply"};
+ "http", "www.",
+ "copyright", "advertise", "(accessed",
"[edit]", "[citation needed]",
+ "site map", "email updates", "contact
us", "rss feeds", "cite this site",
+ "operating hours", "last modified",
"product catalog",
+ "days per week", "leave a comment",
"corporate information",
+ "employment opportunities", "terms of
use", "private policy", "parental guidelines", "copyright policy", "ad
choices",
+ "about us", "about our ads", "privacy
policy", "terms of use",
+ "click for", "photos",
+ "find the latest",
+ "terms of service",
+ "clicking here",
+ "skip to", "sidebar",
+ "Tags:",
+ "available online",
+ "get online",
+ "buy online",
+ "not valid", "get discount",
+ "official site",
+ "this video",
+ //"this book",
+ "this product",
+ "paperback", "hardcover",
+ "audio cd",
+ "related searches",
+ "permission is granted",
+ "[edit",
+ "edit categories",
+ "free license",
+ "permission is granted",
+ "under the terms",
+ "rights reserved",
+ "wikipedia",
+ "recipient of", "this message",
+ "mailing list", "purchase order",
+ "mon-fri", "email us", "privacy pol",
"back to top",
+ "click here", "for details",
"assistance?", "chat live",
+ "free shipping", "company info",
"satisfaction g", "contact us",
+ "menu.", "search.", "sign in", "home.",
+ "additional terms", "may apply"};
private static final String[] OCCURS_STARTS_WITH = new String[]{
- "fax", "write","email", "contact", "conditions", "chat live",
- "we ", "the recipient", "day return", "days return",
- "refund it", "your money",
- "purchase orders",
- "exchange it ", "return it", "day return", "days return",
- "subscribe","posted by", "below" , "corporate",
- "this book"};
+ "fax", "write","email", "contact",
"conditions", "chat live",
+ "we ", "the recipient", "day return",
"days return",
+ "refund it", "your money",
+ "purchase orders",
+ "exchange it ", "return it", "day
return", "days return",
+ "subscribe","posted by", "below" ,
"corporate",
+ "this book"};
public static String acceptableMinedSentence(String sent) {
if (sent==null || sent.length()<40)
@@ -99,10 +104,10 @@ public class GeneratedSentenceProcessor {
System.out.println("Rejection: too many brakets in sent
='"+sent);
return null;
}
-
+
String[] pipes = StringUtils.split(sent, '|');
if (StringUtils.split(sent, '|').length > 2
- || StringUtils.split(sent, '>').length > 2) {
+ || StringUtils.split(sent,
'>').length > 2) {
//System.out.println("Rejection: too many |s or >s in
sent ='"+sent);
return null;
}
@@ -118,26 +123,26 @@ public class GeneratedSentenceProcessor {
// count symbols indicating wrong parts of page to mine for text
// if short and contains too many symbols indicating wrong
area: reject
String sentWrongSym = sentTry.replace(">", "&&&").replace("�",
"&&&")
- .replace("|", "&&&").replace(":",
"&&&").replace("/", "&&&")
- .replace("-", "&&&").replace("%", "&&&");
+ .replace("|",
"&&&").replace(":", "&&&").replace("/", "&&&")
+ .replace("-",
"&&&").replace("%", "&&&");
if ((sentWrongSym.length() - sentTry.length()) >= 4
- && sentTry.length() < 200) // twice ot more
+ && sentTry.length() < 200) //
twice or more
return null;
sent = sent.replace('[', ' ').replace(']', ' ')
- .replace("_should_find_orig_", "").replace(".
.", ". ")
- .replace("amp;", " ").replace("1.", "
").replace("2.", " ")
- .replace("3.", " ").replace("4.", " ").
- /* .replace("2009", "2011")
- .replace("2008", "2011").replace("2006", "2011")
- .replace("2007", "2011").
- */ replace("VIDEO:", " ").replace("Video:", " ")
- .replace("no comments", " ").replace(" ", "
").replace(" ", " ")
- .replace("(more.)", "").replace("more.",
"").replace("<more>", "")
- .replace("[more]", "").replace(".,",
".").replace("<", "")
- .replace("p>", "").replace("product
description", "");
-
- //sent = sent.replace("Click here. ","").replace("Share
this:.","").replace("Facebook.","").
+ .replace("_should_find_orig_",
"").replace(". .", ". ")
+ .replace("amp;", "
").replace("1.", " ").replace("2.", " ")
+ .replace("3.", "
").replace("4.", " ").
+ /* .replace("2009", "2011")
+ .replace("2008", "2011").replace("2006", "2011")
+ .replace("2007", "2011").
+ */ replace("VIDEO:", " ").replace("Video:", " ")
+ .replace("no comments", "
").replace(" ", " ").replace(" ", " ")
+ .replace("(more.)",
"").replace("more.", "").replace("<more>", "")
+ .replace("[more]",
"").replace(".,", ".").replace("<", "")
+ .replace("p>",
"").replace("product description", "");
+
+ //sent = sent.replace("Click here. ","").replace("Share
this:.","").replace("Facebook.","").
// replace("Twitter." Email. Google. Print.
Tumblr. Pinterest. More. Digg. LinkedIn. StumbleUpon. Reddit. Like this: Like
Loading.. ")
// TODO .replace("a.", ".");
@@ -150,9 +155,8 @@ public class GeneratedSentenceProcessor {
}
public static String processSentence(String pageSentence) {
- if (acceptableMinedSentence(pageSentence)==null){
- // TODO OPENNLP-1454 Candidate for logger.debug(...) if
required/helpful
- // System.out.println("Rejected sentence by
GeneratedSentenceProcessor.processSentence.acceptableMinedSentence()");
+ if (acceptableMinedSentence(pageSentence)==null) {
+ LOG.debug("Rejected sentence via
processSentence().acceptableMinedSentence");
return "";
}
if (pageSentence == null)
@@ -165,7 +169,7 @@ public class GeneratedSentenceProcessor {
pageSentence = StringUtils.chomp(pageSentence, "...");
pageSentence = StringUtils.chomp(pageSentence, " ....");
pageSentence = pageSentence.replace("::", ":").replace(".,", ".
")
- .replace("(.)", "");
+ .replace("(.)", "");
pageSentence = pageSentence.trim();
pageSentence = pageSentence.replaceAll("\\s+", " "); // make
single
@@ -178,21 +182,20 @@ public class GeneratedSentenceProcessor {
// at the end
// after pipe
if (pipes.length == 2
- && ((float) pipes[0].length() / (float)
pipes[1].length() > 3.0)) {
+ && ((float) pipes[0].length() /
(float) pipes[1].length() > 3.0)) {
int pipePos = pageSentence.indexOf("|");
if (pipePos > -1)
pageSentence = pageSentence.substring(0,
pipePos - 1).trim();
-
}
if (!StringUtils.contains(pageSentence, '.')
- && !StringUtils.contains(pageSentence, '?')
- && !StringUtils.contains(pageSentence, '!'))
+ &&
!StringUtils.contains(pageSentence, '?')
+ &&
!StringUtils.contains(pageSentence, '!'))
pageSentence = pageSentence + ". ";
pageSentence = pageSentence.replace(" .", ".").replace("..",
".").trim();
- if (!pageSentence.endsWith(".") && !pageSentence.endsWith(":")
- &&!pageSentence.endsWith("!")
&&!pageSentence.endsWith("."))
+ if (!pageSentence.endsWith(".") && !pageSentence.endsWith(":")
+ &&!pageSentence.endsWith("!")
&&!pageSentence.endsWith("."))
pageSentence += ". ";
return pageSentence;
}
@@ -200,26 +203,26 @@ public class GeneratedSentenceProcessor {
public static boolean isProhibitiveWordsOccurOrStartWith(String
sentenceLowercase){
for(String o: OCCURS){
if (sentenceLowercase.contains(o)){
- //System.out.println("Found prohibited
occurrence "+ o +" \n in sentence = "+ sentenceLowercase);
+ LOG.debug("Found prohibited occurrence {} \n in
sentence = {}", o, sentenceLowercase);
return true;
}
}
for(String o: OCCURS_STARTS_WITH){
if (sentenceLowercase.startsWith(o)){
- //System.out.println("Found prohibited
occurrence Start With "+ o +" \n in sentence = "+ sentenceLowercase);
+ LOG.debug("Found prohibited occurrence (starts
with) {} \n in sentence = {}", o, sentenceLowercase);
return true;
}
}
// || sentTry.endsWith("the")
- // || sentTry.endsWith("the.") || sentTry.startsWith("below")
+ // || sentTry.endsWith("the.") || sentTry.startsWith("below")
return false;
}
public static void main(String[] args) {
-
+
String sentence = "Accepted sentence: Educational. Video. About
Us menu. Home. Nobel Prizes and Laureates. Nobel Prizes and Laureates. Physics
Prize. Chemistry Prize. Medicine Prize. Literature Prize. Peace Prize. Prize in
Economic Sciences. Quick Facts. Nomination. Nomination. Physics Prize.
Chemistry Prize. Medicine Prize. Literature Prize. Peace Prize. Prize in
Economic Sciences. Nomination Archive. Ceremonies. Ceremonies. Ceremony
Archive. Nobel Banquet Menus. Nobel Banquet Dress C [...]
-
+
String res =
GeneratedSentenceProcessor.acceptableMinedSentence(sentence);
String para;
para = "inventions of albert einstein
what was albert einsteins invention invention of
einstein what were albert einsteins inventions ";
@@ -249,69 +252,20 @@ public class GeneratedSentenceProcessor {
}
} catch (Exception e) {
- e.printStackTrace();
+ LOG.error(e.getLocalizedMessage(), e);
}
}
public static String normalizeForSentenceSplitting(String pageContent) {
pageContent.replace("Jan.", "January").replace("Feb.",
"February")
- .replace("Mar.", "March").replace("Apr.", "April")
- .replace("Jun.", "June").replace("Jul.", "July")
- .replace("Aug.", "August").replace("Sep.", "September")
- .replace("Oct.", "October").replace("Nov.", "November")
- .replace("Dec.", "December");
+ .replace("Mar.",
"March").replace("Apr.", "April")
+ .replace("Jun.",
"June").replace("Jul.", "July")
+ .replace("Aug.",
"August").replace("Sep.", "September")
+ .replace("Oct.",
"October").replace("Nov.", "November")
+ .replace("Dec.", "December");
return pageContent;
}
-}
-
-/*
-
-if (sentTry.indexOf("click here")>-1 || sentTry.indexOf(" wikip") > -1
-|| sentTry.indexOf("copyright") > -1
-|| sentTry.indexOf("operating hours") > -1
-|| sentTry.indexOf("days per week") > -1
-|| sentTry.indexOf("click for") > -1 || sentTry.indexOf("photos") > -1
-|| sentTry.indexOf("find the latest") > -1
-|| sentTry.startsWith("subscribe")
-|| sentTry.indexOf("Terms of Service") > -1
-|| sentTry.indexOf("clicking here") > -1
-|| sentTry.indexOf("skip to") > -1 || sentTry.indexOf("sidebar") > -1
-|| sentTry.indexOf("Tags:") > -1 || sentTry.startsWith("Posted by")
-|| sentTry.indexOf("available online") > -1
-|| sentTry.indexOf("get online") > -1
-|| sentTry.indexOf("buy online") > -1
-|| sentTry.indexOf("not valid") > -1 || sentTry.indexOf("discount") > -1
-|| sentTry.indexOf("official site") > -1
-|| sentTry.indexOf("this video") > -1
-|| sentTry.indexOf("this book") > -1
-|| sentTry.indexOf("this product") > -1
-|| sentTry.indexOf("paperback") > -1 || sentTry.indexOf("hardcover") > -1
-|| sentTry.indexOf("audio cd") > -1
-|| sentTry.indexOf("related searches") > -1
-|| sentTry.indexOf("permission is granted") > -1
-|| sentTry.indexOf("[edit") > -1
-|| sentTry.indexOf("edit categories") > -1
-|| sentTry.indexOf("free license") > -1
-|| sentTry.indexOf("permission is granted") > -1
-|| sentTry.indexOf("under the terms") > -1
-|| sentTry.indexOf("rights reserved") > -1
-|| sentTry.indexOf("wikipedia") > -1 || sentTry.endsWith("the")
-|| sentTry.endsWith("the.") || sentTry.startsWith("below")
-|| sentTry.indexOf("recipient of")>-1 || sentTry.indexOf("this message")>-1
-||sentTry.indexOf( "mailing list")>-1 ||sentTry.indexOf( "purchase order")>-1
-||sentTry.indexOf( "mon-fri")>-1 ||sentTry.indexOf( "email us")>-1
||sentTry.indexOf( "privacy pol")>-1 ||sentTry.indexOf( "back to top")>-1
-||sentTry.indexOf( "click here")>-1 ||sentTry.indexOf( "for details")>-1
||sentTry.indexOf( "assistance?")>-1 ||sentTry.indexOf( "chat live")>-1
-||sentTry.indexOf( "free shipping")>-1 ||sentTry.indexOf( "company info")>-1
||sentTry.indexOf( "satisfaction g")>-1 ||sentTry.indexOf( "contact us")>-1
-||sentTry.startsWith( "fax") ||sentTry.startsWith( "write") ||
sentTry.startsWith( "email")||sentTry.indexOf( "conditions")>-1
||sentTry.indexOf( "chat live")>-1
-||sentTry.startsWith( "we ") ||sentTry.indexOf( "the recipient")>-1
||sentTry.indexOf( "day return")>-1 ||sentTry.indexOf( "days return")>-1
-
-||sentTry.startsWith( "fax") ||sentTry.indexOf( "refund it")>-1 ||
sentTry.indexOf( "your money")>-1
-||sentTry.indexOf( "cond???")>-1 ||sentTry.indexOf( "purchase orders")>-1
-||sentTry.startsWith( "exchange it ") ||sentTry.indexOf( "return it")>-1
||sentTry.indexOf( "day return")>-1 ||sentTry.indexOf( "days return")>-1
-)
-return null;
-
-*/
\ No newline at end of file
+}
\ No newline at end of file
diff --git
a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/RelatedSentenceFinder.java
b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/RelatedSentenceFinder.java
index 80f02ed..209e29a 100644
---
a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/RelatedSentenceFinder.java
+++
b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/RelatedSentenceFinder.java
@@ -17,13 +17,13 @@
package opennlp.tools.similarity.apps;
+import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
-import java.util.logging.Logger;
import org.apache.commons.lang.StringUtils;
@@ -36,6 +36,8 @@ import opennlp.tools.textsimilarity.ParseTreeChunkListScorer;
import opennlp.tools.textsimilarity.SentencePairMatchResult;
import opennlp.tools.textsimilarity.TextProcessor;
import
opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
/**
* Provides content generation by using web mining and syntactic
generalization to get sentences from the web,
@@ -47,7 +49,9 @@ import
opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcess
*
*/
public class RelatedSentenceFinder {
- private static final Logger LOG =
Logger.getLogger("opennlp.tools.similarity.apps.RelatedSentenceFinder");
+
+ private static final Logger LOG =
LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+
final PageFetcher pFetcher = new PageFetcher();
final ParserChunker2MatcherProcessor sm =
ParserChunker2MatcherProcessor.getInstance();
protected final ParseTreeChunkListScorer parseTreeChunkListScorer = new
ParseTreeChunkListScorer();
@@ -60,14 +64,12 @@ public class RelatedSentenceFinder {
protected final Set<String> visitedURLs = new HashSet<>();
// used to indicate that a sentence is an opinion, so more appropriate
- static final List<String> MENTAL_VERBS = new ArrayList<>(
- Arrays.asList("want", "know", "believe", "appeal",
"ask",
- "accept", "agree",
"allow", "appeal", "ask", "assume", "believe",
- "check", "confirm",
"convince", "deny", "disagree", "explain",
- "ignore", "inform",
"remind", "request", "suggest", "suppose",
- "think", "threaten",
"try", "understand"));
-
- private static final int MAX_FRAGMENT_SENTS = 10;
+ static final List<String> MENTAL_VERBS = Arrays.asList(
+ "want", "know", "believe", "appeal",
"ask",
+ "accept", "agree", "allow", "appeal",
"ask", "assume", "believe",
+ "check", "confirm", "convince", "deny",
"disagree", "explain",
+ "ignore", "inform", "remind",
"request", "suggest", "suppose",
+ "think", "threaten", "try",
"understand");
public RelatedSentenceFinder(int ms, int msr, float thresh, String key)
{
this.MAX_STEPS = ms;
@@ -75,7 +77,7 @@ public class RelatedSentenceFinder {
this.RELEVANCE_THRESHOLD=thresh;
yrunner.setKey(key);
}
-
+
int generateContentAboutIter = 0;
public RelatedSentenceFinder() {
@@ -86,6 +88,7 @@ public class RelatedSentenceFinder {
yrunner.setLang(lang);
}
+
public List<HitBase> findRelatedOpinionsForSentenceFastAndDummy(String
word, List<String> sents) {
return yrunner.runSearch(word, 100);
@@ -104,12 +107,12 @@ public class RelatedSentenceFinder {
if (searchResult != null) {
for (HitBase item : searchResult) { // got some
text from .html
if (item.getAbstractText() != null
- &&
!(item.getUrl().indexOf(".pdf") > 0)) { // exclude
+ &&
!(item.getUrl().indexOf(".pdf") > 0)) { // exclude
// pdf
opinionSentencesToAdd
-
.add(augmentWithMinedSentencesAndVerifyRelevance(item,
- sentence,
sents));
-
+
.add(augmentWithMinedSentencesAndVerifyRelevance(item,
+
sentence, sents));
+
}
}
}
@@ -122,7 +125,7 @@ public class RelatedSentenceFinder {
* Main content generation function which takes a seed as a person, rock
* group, or other entity name and produce a list of text fragments by
web
* mining for <br>
- *
+ *
* @param sentence
* entity name
* @return List<HitBase> of text fragment structures which contain
approved
@@ -142,18 +145,18 @@ public class RelatedSentenceFinder {
int stepCount=0;
for (String verbAddition : extraKeywords) {
List<HitBase> searchResult = yrunner.runSearch(sentence
+ " "
- + verbAddition, MAX_SEARCH_RESULTS);
//100);
+ + verbAddition,
MAX_SEARCH_RESULTS); //100);
if (MAX_SEARCH_RESULTS<searchResult.size())
searchResult = searchResult.subList(0,
MAX_SEARCH_RESULTS);
//TODO for shorter run
if (searchResult != null) {
for (HitBase item : searchResult) { // got some
text from .html
if (item.getAbstractText() != null
- &&
!(item.getUrl().indexOf(".pdf") > 0) && !visitedURLs.contains(item.getUrl())) {
// exclude pdf
+ &&
!(item.getUrl().indexOf(".pdf") > 0) && !visitedURLs.contains(item.getUrl())) {
// exclude pdf
opinionSentencesToAdd
-
.add(//augmentWithMinedSentencesAndVerifyRelevance(item,
- // sentence,
null));
-
buildParagraphOfGeneratedText(item, sentence, null));
+
.add(//augmentWithMinedSentencesAndVerifyRelevance(item,
+
// sentence, null));
+
buildParagraphOfGeneratedText(item, sentence,
null));
visitedURLs.add(item.getUrl());
}
}
@@ -162,7 +165,7 @@ public class RelatedSentenceFinder {
if (stepCount>MAX_STEPS)
break;
}
-
+
// if nothing is written, then get first search result and try
again
try {
if (generateContentAboutIter<4 &&
ContentGeneratorSupport.problematicHitList(opinionSentencesToAdd)){
@@ -173,7 +176,7 @@ public class RelatedSentenceFinder {
opinionSentencesToAdd =
generateContentAbout(discoveredSimilarTopic);
}
} catch (Exception e) {
- e.printStackTrace();
+ LOG.error(e.getLocalizedMessage(), e);
}
return removeDuplicatesFromResultantHits(opinionSentencesToAdd);
@@ -185,7 +188,7 @@ public class RelatedSentenceFinder {
* to relevance assessment by Similarity. Search queries should not be
too
* general (irrelevant search results) or too specific (too few search
* results)
- *
+ *
* @param sentence
* input sentence to form queries
* @return List<String> of search expressions
@@ -201,16 +204,16 @@ public class RelatedSentenceFinder {
for (int i = 0; i < size; i++) {
if (ch.getPOSs().get(i).startsWith("N")
- ||
ch.getPOSs().get(i).startsWith("J")) {
+ ||
ch.getPOSs().get(i).startsWith("J")) {
query.append(ch.getLemmas().get(i)).append(" ");
}
}
query = new StringBuilder(query.toString().trim());
- int len = query.toString().split(" ").length;
+ int len = query.toString().split("\\s+").length;
if (len < 2 || len > 5)
continue;
if (len < 4) { // every word should start with capital
- String[] qs = query.toString().split(" ");
+ String[] qs = query.toString().split("\\s+");
boolean bAccept = true;
for (String w : qs) {
if (w.toLowerCase().equals(w)) // idf
only two words then
@@ -236,12 +239,12 @@ public class RelatedSentenceFinder {
for (int i = 0; i < size; i++) {
if (ch.getPOSs().get(i).startsWith("N")
- ||
ch.getPOSs().get(i).startsWith("J")) {
+ ||
ch.getPOSs().get(i).startsWith("J")) {
query.append(ch.getLemmas().get(i)).append(" ");
}
}
query = new
StringBuilder(query.toString().trim());
- int len = query.toString().split(" ").length;
+ int len = query.toString().split("\\s+").length;
if (len < 2)
continue;
@@ -263,7 +266,7 @@ public class RelatedSentenceFinder {
/**
* remove dupes from queries to easy cleaning dupes and repetitive
search
* afterwards
- *
+ *
* @param hits List<String> of sentences (search queries, or search
results
* abstracts, or titles
* @return List<String> of sentences where dupes are removed
@@ -293,12 +296,11 @@ public class RelatedSentenceFinder {
hitsDedup.add(hits.get(i));
if (hitsDedup.size() < hits.size()) {
- LOG.info("Removed duplicates from formed query,
including "
- + hits.get(idsToRemove.get(0)));
+ LOG.info("Removed duplicates from formed query,
including {}", hits.get(idsToRemove.get(0)));
}
} catch (Exception e) {
- LOG.severe("Problem removing duplicates from query
list");
+ LOG.error("Problem removing duplicates from query
list", e);
}
return hitsDedup;
@@ -307,14 +309,14 @@ public class RelatedSentenceFinder {
/**
* remove dupes from search results
- *
+ *
* @param hits List<HitBase> of search results objects
* @return List<String> of search results objects where dupes are
removed
*/
public static List<HitBase>
removeDuplicatesFromResultantHits(List<HitBase> hits) {
StringDistanceMeasurer meas = new StringDistanceMeasurer();
double dupeThresh = // 0.8; // if more similar, then considered
dupes was
- 0.7;
+ 0.7;
try {
for (int i = 0; i < hits.size(); i++)
for (int j = i + 1; j < hits.size(); j++) {
@@ -330,8 +332,7 @@ public class RelatedSentenceFinder {
continue;
if
(meas.measureStringDistance(sf1, sf2) > dupeThresh) {
fragmList2Results.remove(f2);
-
LOG.info("Removed duplicates from formed fragments list: "
-
+ sf2);
+
LOG.debug("Removed duplicates from formed fragments list: {}", sf2);
}
}
@@ -339,7 +340,7 @@ public class RelatedSentenceFinder {
hits.set(j, hit2);
}
} catch (Exception e) {
- LOG.severe("Problem removing duplicates from list of
fragment");
+ LOG.error("Problem removing duplicates from list of
fragment", e);
}
return hits;
}
@@ -348,7 +349,7 @@ public class RelatedSentenceFinder {
* Takes single search result for an entity which is the subject of the
essay
* to be written and forms essay sentences from the title, abstract, and
* possibly original page
- *
+ *
* @param item The HitBase search result
* @param originalSentence The seed for the essay to be written
* @param sentsAll
@@ -358,7 +359,7 @@ public class RelatedSentenceFinder {
*/
public HitBase augmentWithMinedSentencesAndVerifyRelevance(HitBase item,
- String originalSentence, List<String> sentsAll) {
+
String originalSentence, List<String> sentsAll) {
if (sentsAll == null)
sentsAll = new ArrayList<>();
// put orig sentence in structure
@@ -366,18 +367,18 @@ public class RelatedSentenceFinder {
origs.add(originalSentence);
item.setOriginalSentences(origs);
String title = item.getTitle().replace("<b>", "
").replace("</b>", " ")
- .replace(" ", " ").replace(" ", " ");
+ .replace(" ", " ").replace("
", " ");
// generation results for this sentence
List<Fragment> result = new ArrayList<>();
// form plain text from snippet
String snapshot = item.getAbstractText().replace("<b>", " ")
- .replace("</b>", " ").replace(" ", "
").replace(" ", " ");
+ .replace("</b>", " ").replace("
", " ").replace(" ", " ");
// fix a template expression which can be substituted by
original if
// relevant
String snapshotMarked = snapshot.replace("...",
- " _should_find_orig_ . _should_find_orig_");
+ " _should_find_orig_ .
_should_find_orig_");
String[] fragments = sm.splitSentences(snapshotMarked);
List<String> allFragms = new
ArrayList<>(Arrays.asList(fragments));
@@ -390,7 +391,7 @@ public class RelatedSentenceFinder {
item.setPageContent(downloadedPage);
String pageContent =
Utils.fullStripHTML(item.getPageContent());
pageContent = GeneratedSentenceProcessor
-
.normalizeForSentenceSplitting(pageContent);
+
.normalizeForSentenceSplitting(pageContent);
pageContent =
ContentGeneratorSupport.cleanSpacesInCleanedHTMLpage(pageContent);
//pageContent =
pageContent.trim().replaceAll(" [A-Z]", ". $0")// .replace(" ",
// // ". ")
@@ -402,7 +403,7 @@ public class RelatedSentenceFinder {
}
}
} catch (Exception e) {
- System.err.println("Problem downloading the page and
splitting into sentences");
+ LOG.error("Problem downloading the page and splitting
into sentences", e);
return item;
}
@@ -413,23 +414,23 @@ public class RelatedSentenceFinder {
String pageSentence = "";
// try to find original sentence from webpage
if (fragment.contains("_should_find_orig_") && sents !=
null && sents.length > 0){
- try {
+ try {
// first try sorted sentences from page
by length approach
String[] sentsSortedByLength =
extractSentencesFromPage(downloadedPage);
String[] mainAndFollowSent = null;
try {
mainAndFollowSent =
getFullOriginalSentenceFromWebpageBySnippetFragment(
-
fragment.replace("_should_find_orig_", ""), sentsSortedByLength);
+
fragment.replace("_should_find_orig_", ""), sentsSortedByLength);
} catch (Exception e) {
- e.printStackTrace();
+
LOG.error(e.getLocalizedMessage(), e);
}
// if the above gives null than try to
match all sentences from snippet fragment
if (mainAndFollowSent==null ||
mainAndFollowSent[0]==null){
mainAndFollowSent =
getFullOriginalSentenceFromWebpageBySnippetFragment(
-
fragment.replace("_should_find_orig_", ""), sents);
+
fragment.replace("_should_find_orig_", ""), sents);
}
-
+
if (mainAndFollowSent!=null ||
mainAndFollowSent[0]!=null){
pageSentence =
mainAndFollowSent[0];
for(int i = 1; i<
mainAndFollowSent.length; i++)
@@ -438,11 +439,9 @@ public class RelatedSentenceFinder {
}
} catch (Exception e) {
- e.printStackTrace();
+ LOG.error(e.getLocalizedMessage(), e);
}
- }
-
- else
+ } else
// or get original snippet
pageSentence = fragment;
if (pageSentence != null)
@@ -459,48 +458,43 @@ public class RelatedSentenceFinder {
double measScore, syntScore,
mentalScore = 0.0;
SentencePairMatchResult matchRes =
sm.assessRelevance(pageSentence
- + " " + title,
originalSentence);
+ + " " +
title, originalSentence);
List<List<ParseTreeChunk>> match =
matchRes.getMatchResult();
if (!matchRes.isVerbExists() ||
matchRes.isImperativeVerb()) {
- System.out.println("Rejected
Sentence : No verb OR Yes imperative verb :" + pageSentence);
+ LOG.debug("Rejected Sentence :
No verb OR Yes imperative verb: {}", pageSentence);
continue;
}
- syntScore = parseTreeChunkListScorer
-
.getParseTreeChunkListScore(match);
-
System.out.println(parseTreeChunk.listToString(match) + " "
- + syntScore + "\n
pre-processed sent = '" + pageSentence);
+ syntScore =
parseTreeChunkListScorer.getParseTreeChunkListScore(match);
+ LOG.debug("{} {}\n pre-processed sent =
'{}'", parseTreeChunk.listToString(match), syntScore, pageSentence);
if (syntScore < RELEVANCE_THRESHOLD){
// 1.5) { // trying other sents
for (String currSent :
sentsAll) {
if
(currSent.startsWith(originalSentence))
continue;
- match =
sm.assessRelevance(currSent, pageSentence)
-
.getMatchResult();
- double syntScoreCurr =
parseTreeChunkListScorer
-
.getParseTreeChunkListScore(match);
+ match =
sm.assessRelevance(currSent, pageSentence).getMatchResult();
+ double syntScoreCurr =
parseTreeChunkListScorer.getParseTreeChunkListScore(match);
if (syntScoreCurr >
syntScore) {
syntScore =
syntScoreCurr;
}
}
if (syntScore >
RELEVANCE_THRESHOLD) {
- System.out.println("Got
match with other sent: "
- +
parseTreeChunk.listToString(match) + " " + syntScore);
+ LOG.debug("Got match
with other sent: {} {}", parseTreeChunk.listToString(match), syntScore);
}
}
measScore =
STRING_DISTANCE_MEASURER.measureStringDistance(
- originalSentence,
pageSentence);
+
originalSentence, pageSentence);
if ((syntScore > RELEVANCE_THRESHOLD ||
measScore > 0.5)
- && measScore < 0.8 &&
pageSentence.length() > 40) // >70
+ &&
measScore < 0.8 && pageSentence.length() > 40) // >70
{
String pageSentenceProc =
GeneratedSentenceProcessor
-
.acceptableMinedSentence(pageSentence);
+
.acceptableMinedSentence(pageSentence);
if (pageSentenceProc != null) {
pageSentenceProc =
GeneratedSentenceProcessor
-
.processSentence(pageSentenceProc);
+
.processSentence(pageSentenceProc);
followSent = new
StringBuilder(GeneratedSentenceProcessor.processSentence(followSent.toString()));
if (followSent != null)
{
pageSentenceProc += " "+ followSent;
@@ -508,21 +502,20 @@ public class RelatedSentenceFinder {
pageSentenceProc =
Utils.convertToASCII(pageSentenceProc);
Fragment f = new
Fragment(pageSentenceProc, syntScore + measScore
- +
mentalScore + (double) pageSentenceProc.length()
- /
(double) 50);
+
+ mentalScore + (double) pageSentenceProc.length()
+
/ (double) 50);
f.setSourceURL(item.getUrl());
f.fragment = fragment;
result.add(f);
-
System.out.println("Accepted sentence: " + pageSentenceProc + " | "+followSent
- + "|
with title= " + title);
- System.out.println("For
fragment = " + fragment);
+ LOG.debug("Accepted
sentence: {} | {} | with title = {}", pageSentenceProc, followSent, title);
+ LOG.debug("For fragment
= {}", fragment);
} else
-
System.out.println("Rejected sentence due to wrong area at webpage: " +
pageSentence);
+ LOG.debug("Rejected
sentence due to wrong area at webpage: {}", pageSentence);
} else
- System.out.println("Rejected
sentence due to low score: " + pageSentence);
+ LOG.debug("Rejected sentence
due to low score: {}", pageSentence);
// }
} catch (Throwable t) {
- t.printStackTrace();
+ LOG.error(t.getLocalizedMessage(), t);
}
}
}
@@ -530,12 +523,11 @@ public class RelatedSentenceFinder {
return item;
}
-
// given a fragment from snippet, finds an original sentence at a
webpage by
// optimizing alignment score
public static String[]
getFullOriginalSentenceFromWebpageBySnippetFragment(
- String fragment, String[] sents) {
+ String fragment, String[] sents) {
if (fragment.trim().length() < 15)
return null;
@@ -552,7 +544,7 @@ public class RelatedSentenceFinder {
result = s;
dist = distCurr;
try {
- if (i < sents.length - 1 && sents[i +
1].length() > 60) {
+ if (i < sents.length - 1 && sents[i +
1].length() > 60) {
String f1 =
GeneratedSentenceProcessor.acceptableMinedSentence(sents[i+1]);
if (f1!=null){
followSent = new
StringBuilder(f1);
@@ -566,7 +558,7 @@ public class RelatedSentenceFinder {
}
}
} catch (Exception e) {
- e.printStackTrace();
+ LOG.error(e.getLocalizedMessage(), e);
}
}
}
@@ -576,7 +568,7 @@ public class RelatedSentenceFinder {
// given a fragment from snippet, finds an original sentence at a
webpage by
// optimizing alignmemt score
public static String[]
getBestFullOriginalSentenceFromWebpageBySnippetFragment(
- String fragment, String[] sents) {
+ String fragment, String[] sents) {
if (fragment.trim().length() < 15)
return null;
int bestSentIndex = -1;
@@ -598,7 +590,7 @@ public class RelatedSentenceFinder {
result = sents[bestSentIndex];
if (bestSentIndex < sents.length - 1
- && sents[bestSentIndex + 1].length() >
60) {
+ && sents[bestSentIndex
+ 1].length() > 60) {
followSent = sents[bestSentIndex + 1];
}
@@ -607,8 +599,7 @@ public class RelatedSentenceFinder {
return new String[] { result, followSent };
}
- public String[] extractSentencesFromPage(String downloadedPage)
- {
+ public String[] extractSentencesFromPage(String downloadedPage) {
int maxSentsFromPage= 100;
@@ -618,7 +609,7 @@ public class RelatedSentenceFinder {
List<TextChunk> sentsList = new ArrayList<>();
for(String s: sents){
s =
ContentGeneratorSupport.cleanSpacesInCleanedHTMLpage(s);
- /* s = s.trim().replace(" ", ". ").replace("..",
".").replace(". . .", " ")
+ /* s = s.trim().replace(" ", ". ").replace("..", ".").replace(". . .", "
")
.replace(": ", ". ").replace("- ", ".
").
replace (". .",".").trim(); */
sentsList.add(new TextChunk(s, s.length()));
@@ -648,6 +639,7 @@ public class RelatedSentenceFinder {
this.text = s;
this.len = length;
}
+
public final String text;
public final int len;
}
@@ -663,8 +655,7 @@ public class RelatedSentenceFinder {
float minFragmentLength = 40, minFragmentLengthSpace=4;
List<String> sentsClean = new ArrayList<>();
- for (String sentenceOrMultSent : longestSents)
- {
+ for (String sentenceOrMultSent : longestSents) {
if (sentenceOrMultSent==null ||
sentenceOrMultSent.length()<20)
continue;
if
(GeneratedSentenceProcessor.acceptableMinedSentence(sentenceOrMultSent)==null){
@@ -705,16 +696,16 @@ public class RelatedSentenceFinder {
origs.add(originalSentence);
item.setOriginalSentences(origs);
String title = item.getTitle().replace("<b>", "
").replace("</b>", " ")
- .replace(" ", " ").replace(" ", " ");
+ .replace(" ", " ").replace("
", " ");
// generation results for this sentence
// form plain text from snippet
String snapshot = item.getAbstractText().replace("<b>", " ")
- .replace("</b>", " ").replace(" ", "
").replace(" ", " ");
+ .replace("</b>", " ").replace("
", " ").replace(" ", " ");
// fix a template expression which can be substituted by
original if
// relevant
String snapshotMarked = snapshot.replace("...",
- " _should_find_orig_ . _should_find_orig_");
+ " _should_find_orig_ .
_should_find_orig_");
String[] fragments = sm.splitSentences(snapshotMarked);
List<String> allFragms = new
ArrayList<>(Arrays.asList(fragments));
@@ -727,7 +718,7 @@ public class RelatedSentenceFinder {
item.setPageContent(downloadedPage);
String pageContent =
Utils.fullStripHTML(item.getPageContent());
pageContent = GeneratedSentenceProcessor
-
.normalizeForSentenceSplitting(pageContent);
+
.normalizeForSentenceSplitting(pageContent);
pageContent =
ContentGeneratorSupport.cleanSpacesInCleanedHTMLpage(pageContent);
//pageContent =
pageContent.trim().replaceAll(" [A-Z]", ". $0")// .replace(" ",
// // ". ")
@@ -740,7 +731,7 @@ public class RelatedSentenceFinder {
}
}
} catch (Exception e) {
- System.err.println("Problem downloading the page and
splitting into sentences");
+ LOG.error("Problem downloading the page and splitting
into sentences", e);
return new Triple<>(allFragms, downloadedPage, sents);
}
return new Triple<>(allFragms, downloadedPage, sents);
@@ -757,30 +748,26 @@ public class RelatedSentenceFinder {
String pageSentence = "";
// try to find original sentence from webpage
if (fragment.contains("_should_find_orig_") && sents != null
- && sents.length > 0){
- try {
+ && sents.length > 0){
+ try {
// first try sorted sentences from page by
length approach
String[] sentsSortedByLength =
extractSentencesFromPage(downloadedPage);
-
try {
mainAndFollowSent =
getFullOriginalSentenceFromWebpageBySnippetFragment(
-
fragment.replace("_should_find_orig_", ""), sentsSortedByLength);
+
fragment.replace("_should_find_orig_", ""), sentsSortedByLength);
} catch (Exception e) {
- e.printStackTrace();
+ LOG.error(e.getLocalizedMessage(), e);
}
// if the above gives null than try to match
all sentences from snippet fragment
if (mainAndFollowSent==null ||
mainAndFollowSent[0]==null){
mainAndFollowSent =
getFullOriginalSentenceFromWebpageBySnippetFragment(
-
fragment.replace("_should_find_orig_", ""), sents);
+
fragment.replace("_should_find_orig_", ""), sents);
}
-
-
} catch (Exception e) {
- e.printStackTrace();
+ LOG.error(e.getLocalizedMessage(), e);
}
- }
- else
+ } else
// or get original snippet
pageSentence = fragment;
if (pageSentence != null)
@@ -788,11 +775,11 @@ public class RelatedSentenceFinder {
return mainAndFollowSent;
- }
+ }
private Fragment verifyCandidateSentencesAndFormParagraph(
- String[] candidateSentences, HitBase item, String
fragment, String originalSentence, List<String> sentsAll) {
- Fragment result = null;
+ String[] candidateSentences, HitBase
item, String fragment, String originalSentence, List<String> sentsAll) {
+ Fragment result = null;
String pageSentence = candidateSentences[0];
StringBuilder followSent = new StringBuilder();
@@ -803,9 +790,9 @@ public class RelatedSentenceFinder {
// resultant sentence SHOULD NOT be longer than for times the
size of
// snippet fragment
if (!(pageSentence != null && pageSentence.length()>50) ){
- System.out.println("Cannot accept the sentence = "+
pageSentence +
- "!(pageSentence != null &&
pageSentence.length()>50 && (float) pageSentence.length() / (float)
fragment.length() < 4.0) )");
-
+ LOG.debug("Cannot accept the sentence = "+ pageSentence
+
+ "!(pageSentence != null
&& pageSentence.length()>50 && (float) pageSentence.length() / (float)
fragment.length() < 4.0) )");
+
return null;
}
@@ -815,60 +802,52 @@ public class RelatedSentenceFinder {
double measScore, syntScore, mentalScore = 0.0;
SentencePairMatchResult matchRes =
sm.assessRelevance(pageSentence
- + " " + title, originalSentence);
+ + " " + title,
originalSentence);
List<List<ParseTreeChunk>> match =
matchRes.getMatchResult();
if (match==null || match.size()<1){
- System.out
- .println("Rejected Sentence : empty match "+
pageSentence);
+ LOG.debug("Rejected Sentence : empty match {}",
pageSentence);
return null;
}
-
+
if (!matchRes.isVerbExists() ||
matchRes.isImperativeVerb()) {
- System.out
- .println("Rejected Sentence : No verb OR Yes
imperative verb :"
- + pageSentence);
+ LOG.debug("Rejected Sentence : No verb OR Yes
imperative verb: {}", pageSentence);
return null;
}
- syntScore = parseTreeChunkListScorer
- .getParseTreeChunkListScore(match);
- System.out.println(parseTreeChunk.listToString(match) +
" "
- + syntScore + "\n pre-processed sent =
'" + pageSentence);
+ syntScore =
parseTreeChunkListScorer.getParseTreeChunkListScore(match);
+ LOG.debug("{} {}\n pre-processed sent =
'{}'",parseTreeChunk.listToString(match), syntScore, pageSentence);
try {
if (sentsAll!=null && syntScore <
RELEVANCE_THRESHOLD){ // 1.5) { // trying other sents
for (String currSent : sentsAll) {
if
(currSent.startsWith(originalSentence))
continue;
- match =
sm.assessRelevance(currSent, pageSentence)
-
.getMatchResult();
- double syntScoreCurr =
parseTreeChunkListScorer
-
.getParseTreeChunkListScore(match);
+ match =
sm.assessRelevance(currSent, pageSentence).getMatchResult();
+ double syntScoreCurr =
parseTreeChunkListScorer.getParseTreeChunkListScore(match);
if (syntScoreCurr > syntScore) {
syntScore =
syntScoreCurr;
}
}
if (syntScore > RELEVANCE_THRESHOLD) {
- System.out.println("Got match
with other sent: "
- +
parseTreeChunk.listToString(match) + " " + syntScore);
+ LOG.debug("Got match with other
sent: {} {}", parseTreeChunk.listToString(match), syntScore);
}
}
} catch (Exception e) {
- e.printStackTrace();
+ LOG.error(e.getLocalizedMessage(), e);
}
measScore =
STRING_DISTANCE_MEASURER.measureStringDistance(
- originalSentence, pageSentence);
+ originalSentence,
pageSentence);
if ((syntScore > RELEVANCE_THRESHOLD || measScore > 0.5)
- && measScore < 0.8 &&
pageSentence.length() > 40) // >70
+ && measScore < 0.8 &&
pageSentence.length() > 40) // >70
{
String pageSentenceProc =
GeneratedSentenceProcessor
-
.acceptableMinedSentence(pageSentence);
+
.acceptableMinedSentence(pageSentence);
if (pageSentenceProc != null) {
pageSentenceProc =
GeneratedSentenceProcessor
-
.processSentence(pageSentenceProc);
+
.processSentence(pageSentenceProc);
followSent = new
StringBuilder(GeneratedSentenceProcessor.processSentence(followSent.toString()));
if (followSent != null) {
pageSentenceProc += " "+
followSent;
@@ -876,31 +855,26 @@ public class RelatedSentenceFinder {
pageSentenceProc =
Utils.convertToASCII(pageSentenceProc);
result = new Fragment(pageSentenceProc,
syntScore + measScore
- + mentalScore +
(double) pageSentenceProc.length()
- / (double) 50);
+ +
mentalScore + (double) pageSentenceProc.length() / (double) 50);
result.setSourceURL(item.getUrl());
result.fragment = fragment;
- System.out.println("Accepted sentence:
" + pageSentenceProc
- + "| with title= " +
title);
- System.out.println("For fragment = " +
fragment);
+ LOG.debug("Accepted sentence: {} | with
title = {}", pageSentenceProc, title);
+ LOG.debug("For fragment = {}",
fragment);
} else
- System.out
- .println("Rejected sentence due to
wrong area at webpage: "
- + pageSentence);
+ LOG.debug("Rejected sentence due to
wrong area at webpage: {}", pageSentence);
} else
- System.out.println("Rejected sentence due to
low score: "
- + pageSentence);
+ LOG.debug("Rejected sentence due to low score:
{}", pageSentence);
// }
} catch (Throwable t) {
- t.printStackTrace();
+ LOG.error(t.getLocalizedMessage(), t);
}
return result;
}
public HitBase buildParagraphOfGeneratedText(HitBase item,
- String originalSentence, List<String> sentsAll) {
+
String originalSentence, List<String> sentsAll) {
List<Fragment> results = new ArrayList<>() ;
Triple<List<String>, String, String[]>
fragmentExtractionResults = formCandidateFragmentsForPage(item,
originalSentence, sentsAll);
@@ -932,20 +906,20 @@ public class RelatedSentenceFinder {
// uncomment the sentence you would like to serve as a
seed sentence for
// content generation for an event description
hits = f.generateContentAbout("Albert Einstein"
- // "Britney Spears - The Femme Fatale
Tour"
- // "Rush Time Machine",
- // "Blue Man Group" ,
- // "Belly Dance With Zaharah",
- // "Hollander Musicology Lecture:
Danielle Fosler-Lussier, Guest Lecturer",
- // "Jazz Master and arguably the most
famous jazz musician alive, trumpeter Wynton Marsalis",
- );
- System.out.println(HitBase.toString(hits));
- System.out.println(HitBase.toResultantString(hits));
+ // "Britney Spears -
The Femme Fatale Tour"
+ // "Rush Time Machine",
+ // "Blue Man Group" ,
+ // "Belly Dance With
Zaharah",
+ // "Hollander
Musicology Lecture: Danielle Fosler-Lussier, Guest Lecturer",
+ // "Jazz Master and
arguably the most famous jazz musician alive, trumpeter Wynton Marsalis",
+ );
+ LOG.info(HitBase.toString(hits));
+ LOG.info(HitBase.toResultantString(hits));
// WordFileGenerator.createWordDoc("Essey about Albert
Einstein",
// hits.get(0).getTitle(), hits);
} catch (Exception e) {
- e.printStackTrace();
+ LOG.error(e.getLocalizedMessage(), e);
}
}
diff --git
a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/SpeechRecognitionResultsProcessor.java
b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/SpeechRecognitionResultsProcessor.java
index 533f8e2..13c24f6 100644
---
a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/SpeechRecognitionResultsProcessor.java
+++
b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/SpeechRecognitionResultsProcessor.java
@@ -16,19 +16,21 @@
*/
package opennlp.tools.similarity.apps;
+import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
-import java.util.logging.Logger;
import opennlp.tools.textsimilarity.ParseTreeChunk;
import opennlp.tools.textsimilarity.ParseTreeChunkListScorer;
import opennlp.tools.textsimilarity.SentencePairMatchResult;
import
opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
public class SpeechRecognitionResultsProcessor /*extends BingWebQueryRunner*/ {
- private static final Logger LOG =
-
Logger.getLogger("opennlp.tools.similarity.apps.SpeechRecognitionResultsProcessor");
+ private static final Logger LOG =
LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+
private final ParseTreeChunkListScorer parseTreeChunkListScorer = new
ParseTreeChunkListScorer();
private ParserChunker2MatcherProcessor sm;
private final WebSearchEngineResultsScraper scraper = new
WebSearchEngineResultsScraper();
@@ -71,8 +73,7 @@ public class SpeechRecognitionResultsProcessor /*extends
BingWebQueryRunner*/ {
}
*/
} catch (Exception e) {
- LOG.severe("Problem processing snapshot " + snapshot);
- e.printStackTrace();
+ LOG.error("Problem processing snapshot " + snapshot, e);
}
totalMatchScore += score;
@@ -104,15 +105,13 @@ public class SpeechRecognitionResultsProcessor /*extends
BingWebQueryRunner*/ {
try {
List<HitBase> resultList = scraper.runSearch(sentence);
double scoreForSentence = calculateTotalMatchScoreForHits(resultList,
sentence);
- // TODO OPENNLP-1454 Candidate for logger.debug(...) if
required/helpful
- // System.out.println("Total meaningfulness score = " +
scoreForSentence + " for sentence = " + sentence);
+ LOG.debug("Total meaningfulness score = {} for sentence = {}",
scoreForSentence, sentence);
if (scoreForSentence > bestSentScore) {
bestSentScore = scoreForSentence;
}
res.add(new SentenceMeaningfullnessScore(sentence, scoreForSentence));
} catch (Exception e) {
- LOG.warning("No search results for query '" + sentence);
- e.printStackTrace();
+ LOG.warn("No search results for query '{}' - reason: {}", sentence,
e.getLocalizedMessage());
return null;
}
}
@@ -129,6 +128,7 @@ public class SpeechRecognitionResultsProcessor /*extends
BingWebQueryRunner*/ {
score = sc;
}
+ @Override
public String toString() {
return "Total meaningfulness score = " + score + " for sentence = "
+ sentence + "\n";
@@ -141,29 +141,29 @@ public class SpeechRecognitionResultsProcessor /*extends
BingWebQueryRunner*/ {
public static void main(String[] args) {
SpeechRecognitionResultsProcessor proc = new
SpeechRecognitionResultsProcessor();
- proc.runSearchAndScoreMeaningfulness(Arrays.asList(new String[] {
- "meeting with alex at you for not to come over to 8 pm",
- "meeting with alex at you for not to come over to eat",
- "meeting with alex at il fornaio tomorrow to 8 pm" }));
-
- proc.runSearchAndScoreMeaningfulness(Arrays.asList(new String[] {
- "remember to buy milk tomorrow for details",
- "remember to buy milk tomorrow from trader joes",
- "remember to buy milk tomorrow from 3 to jones",
- "remember to buy milk tomorrow for for details",
- "remember to buy milk tomorrow from third to joes",
- "remember to buy milk tomorrow from third to jones",
- "remember to buy milk tomorrow from for d jones" }));
-
- proc.runSearchAndScoreMeaningfulness(Arrays.asList(new String[] {
- "I'm off tomorrow to shop at trader joes",
- "number to get milk tomorrow trader joes",
- "number 2 finals tomorrow from trader joes",
- "number 2 finals tomorrow trader joes",
- "number to buy move tomorrow from trader joes",
- "number to buy move tomorrow trader joes",
- "define move tomorrow from trader joes",
- "define move tomorrow trader joes", }));
+ proc.runSearchAndScoreMeaningfulness(Arrays.asList(
+ "meeting with alex at you for not to come over to 8 pm",
+ "meeting with alex at you for not to come over to eat",
+ "meeting with alex at il fornaio tomorrow to 8 pm"));
+
+ proc.runSearchAndScoreMeaningfulness(Arrays.asList(
+ "remember to buy milk tomorrow for details",
+ "remember to buy milk tomorrow from trader joes",
+ "remember to buy milk tomorrow from 3 to jones",
+ "remember to buy milk tomorrow for for details",
+ "remember to buy milk tomorrow from third to joes",
+ "remember to buy milk tomorrow from third to jones",
+ "remember to buy milk tomorrow from for d jones"));
+
+ proc.runSearchAndScoreMeaningfulness(Arrays.asList(
+ "I'm off tomorrow to shop at trader joes",
+ "number to get milk tomorrow trader joes",
+ "number 2 finals tomorrow from trader joes",
+ "number 2 finals tomorrow trader joes",
+ "number to buy move tomorrow from trader joes",
+ "number to buy move tomorrow trader joes",
+ "define move tomorrow from trader joes",
+ "define move tomorrow trader joes"));
}
}
diff --git
a/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/GeneralizationListReducer.java
b/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/GeneralizationListReducer.java
index feebd63..700582c 100644
---
a/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/GeneralizationListReducer.java
+++
b/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/GeneralizationListReducer.java
@@ -17,15 +17,20 @@
package opennlp.tools.textsimilarity;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
public class GeneralizationListReducer {
- public List<ParseTreeChunk> applyFilteringBySubsumption_OLD(
- List<ParseTreeChunk> result) {
- List<ParseTreeChunk> resultDupl = new ArrayList<>(new HashSet<>(result));
- result = resultDupl;
+
+ private static final Logger LOG =
LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+
+ public List<ParseTreeChunk>
applyFilteringBySubsumption_OLD(List<ParseTreeChunk> result) {
+ result = new ArrayList<>(new HashSet<>(result));
if (result.size() < 2)
return result; // nothing to reduce
List<ParseTreeChunk> resultReduced = new ArrayList<>();
@@ -45,8 +50,7 @@ public class GeneralizationListReducer {
}
if (resultReduced.size() < 1) {
- // TODO OPENNLP-1454 Candidate for logger.debug(...) if required/helpful
- // System.err.println("Wrong subsumption reduction");
+ LOG.debug("Wrong subsumption reduction");
}
if (resultReduced.size() > 1) {
@@ -57,8 +61,7 @@ public class GeneralizationListReducer {
}
- public List<ParseTreeChunk> applyFilteringBySubsumptionOLD(
- List<ParseTreeChunk> result) {
+ public List<ParseTreeChunk>
applyFilteringBySubsumptionOLD(List<ParseTreeChunk> result) {
List<ParseTreeChunk> resultDupl;
if (result.size() < 2)
return result; // nothing to reduce
@@ -86,8 +89,7 @@ public class GeneralizationListReducer {
}
resultReduced = resultDupl;
if (resultReduced.size() < 1) {
- // TODO OPENNLP-1454 Candidate for logger.debug(...) if required/helpful
- // System.err.println("Wrong subsumption reduction");
+ LOG.debug("Wrong subsumption reduction");
}
if (resultReduced.size() > 1) {
@@ -98,8 +100,7 @@ public class GeneralizationListReducer {
}
- public List<ParseTreeChunk> applyFilteringBySubsumption(
- List<ParseTreeChunk> result) {
+ public List<ParseTreeChunk> applyFilteringBySubsumption(List<ParseTreeChunk>
result) {
List<Integer> resultDuplIndex = new ArrayList<>();
List<ParseTreeChunk> resultReduced = new ArrayList<>();
@@ -135,8 +136,7 @@ public class GeneralizationListReducer {
}
if (resultReduced.size() < 1) {
- // TODO OPENNLP-1454 Candidate for logger.debug(...) if required/helpful
- // System.err.println("Wrong subsumption reduction");
+ LOG.debug("Wrong subsumption reduction");
resultReduced = result;
}
diff --git
a/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParseTreeChunkListScorer.java
b/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParseTreeChunkListScorer.java
index 1ebc613..90e28b4 100644
---
a/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParseTreeChunkListScorer.java
+++
b/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParseTreeChunkListScorer.java
@@ -17,20 +17,24 @@
package opennlp.tools.textsimilarity;
+import java.lang.invoke.MethodHandles;
import java.util.List;
import opennlp.tools.parse_thicket.matching.LemmaGeneralizer;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
public class ParseTreeChunkListScorer {
+
+  private static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+
// find the single expression with the highest score
- public double getParseTreeChunkListScore(
- List<List<ParseTreeChunk>> matchResult) {
+  public double getParseTreeChunkListScore(List<List<ParseTreeChunk>> matchResult) {
double currScore = 0.0;
for (List<ParseTreeChunk> chunksGivenPhraseType : matchResult)
for (ParseTreeChunk chunk : chunksGivenPhraseType) {
double score = getScore(chunk);
- // TODO OPENNLP-1454 Candidate for logger.debug(...) if
required/helpful
- // System.out.println(chunk+ " => score >>> "+score);
+ LOG.debug("{} => score >>> {}", chunk, score);
if (score > currScore) {
currScore = score;
}
@@ -39,15 +43,13 @@ public class ParseTreeChunkListScorer {
}
// get max score per phrase type and then sum up
- public double getParseTreeChunkListScoreAggregPhraseType(
- List<List<ParseTreeChunk>> matchResult) {
+  public double getParseTreeChunkListScoreAggregPhraseType(List<List<ParseTreeChunk>> matchResult) {
double currScoreTotal = 0.0;
for (List<ParseTreeChunk> chunksGivenPhraseType : matchResult) {
double currScorePT = 0.0;
for (ParseTreeChunk chunk : chunksGivenPhraseType) {
double score = getScore(chunk);
- // TODO OPENNLP-1454 Candidate for logger.debug(...) if
required/helpful
- // System.out.println(chunk+ " => score >>> "+score);
+ LOG.debug("{} => score >>> {}", chunk, score);
if (score > currScorePT) {
currScorePT = score;
}
@@ -78,15 +80,12 @@ public class ParseTreeChunkListScorer {
}
} else if (l.startsWith(LemmaGeneralizer.W2V_PREFIX) ){
try {
-        float val = Float.parseFloat(l.substring(LemmaGeneralizer.W2V_PREFIX.length()));
- score+= 1- val;
- } catch (NumberFormatException e) {
- e.printStackTrace();
- }
- }
-
- else {
-
+        float val = Float.parseFloat(l.substring(LemmaGeneralizer.W2V_PREFIX.length()));
+ score+= 1- val;
+ } catch (NumberFormatException e) {
+ LOG.error(e.getLocalizedMessage(), e);
+ }
+ } else {
if (pos.startsWith("NN") || pos.startsWith("NP")
|| pos.startsWith("CD") || pos.startsWith("RB")) {
score += 1.0;
diff --git
a/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserChunker2MatcherProcessor.java
b/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserChunker2MatcherProcessor.java
index 3dc6b30..22dc78b 100644
---
a/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserChunker2MatcherProcessor.java
+++
b/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserChunker2MatcherProcessor.java
@@ -18,16 +18,16 @@
package opennlp.tools.textsimilarity.chunker2matcher;
+import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
+import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
-import java.util.logging.Level;
-import java.util.logging.Logger;
import opennlp.tools.chunker.ChunkerME;
import opennlp.tools.chunker.ChunkerModel;
@@ -52,9 +52,13 @@ import opennlp.tools.tokenize.Tokenizer;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
import opennlp.tools.util.Span;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
public class ParserChunker2MatcherProcessor {
+  private static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+
static final int MIN_SENTENCE_LENGTH = 10;
private static final String MODEL_DIR_KEY = "nlp.models.dir";
// TODO config
@@ -69,8 +73,6 @@ public class ParserChunker2MatcherProcessor {
private Parser parser;
private ChunkerME chunker;
private static final int NUMBER_OF_SECTIONS_IN_SENTENCE_CHUNKS = 5;
-  private static final Logger LOG =
-      Logger.getLogger("opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor");
private Map<String, String[][]> sentence_parseObject;
public SentenceDetector getSentenceDetector() {
@@ -100,11 +102,10 @@ public class ParserChunker2MatcherProcessor {
@SuppressWarnings("unchecked")
protected ParserChunker2MatcherProcessor() {
try {
- sentence_parseObject = (Map<String, String[][]>) ParserCacheSerializer
- .readObject();
+      sentence_parseObject = (Map<String, String[][]>) ParserCacheSerializer.readObject();
} catch (Exception e) {
// this file might not exist initially
-      LOG.warning("parsing cache file does not exist (but should be created)");
+ LOG.warn("parsing cache file does not exist (but should be created)");
sentence_parseObject = new HashMap<>();
}
if (sentence_parseObject == null)
@@ -124,8 +125,8 @@ public class ParserChunker2MatcherProcessor {
initializeParser();
initializeChunker();
} catch (Exception e) { // a typical error when 'model' is not installed
- LOG.warning("The model can't be read and we rely on cache");
-      System.err.println("Please install OpenNLP model files in 'src/test/resources' (folder 'model'");
+ LOG.warn("The model can't be read and we rely on cache");
+      LOG.warn("Please put OpenNLP model files in 'src/test/resources' (folder 'model')");
}
}
@@ -213,8 +214,7 @@ public class ParserChunker2MatcherProcessor {
return parseSentenceNlp(sentence, true);
}
- public synchronized Parse parseSentenceNlp(String sentence,
- boolean normalizeText) {
+  public synchronized Parse parseSentenceNlp(String sentence, boolean normalizeText) {
// don't try to parse very short sentence, not much info in it anyway,
// most likely a heading
if (sentence == null || sentence.trim().length() < MIN_SENTENCE_LENGTH)
@@ -224,7 +224,7 @@ public class ParserChunker2MatcherProcessor {
try {
parseArray = ParserTool.parseLine(sentence, parser, 1);
} catch (Throwable t) {
-      LOG.log(Level.WARNING, "failed to parse the sentence : '" + sentence); //, t);
+ LOG.warn("failed to parse the sentence : '{}'", sentence);
return null;
}
// there should be only one result parse
@@ -243,8 +243,7 @@ public class ParserChunker2MatcherProcessor {
* (noun, verb etc.)
*/
- public synchronized List<List<ParseTreeChunk>>
formGroupedPhrasesFromChunksForPara(
- String para) {
+  public synchronized List<List<ParseTreeChunk>> formGroupedPhrasesFromChunksForPara(String para) {
List<List<ParseTreeChunk>> listOfChunksAccum = new ArrayList<>();
String[] sentences = splitSentences(para);
for (String sent : sentences) {
@@ -282,25 +281,24 @@ public class ParserChunker2MatcherProcessor {
String[] tags; // posTagger.tag(toks);
SentenceNode node = parseSentenceNode(sentence);
if (node == null) {
- LOG.info("Problem parsing sentence '" + sentence);
+ LOG.warn("Problem parsing sentence '{}'", sentence);
return null;
}
List<String> POSlist = node.getOrderedPOSList();
tags = POSlist.toArray(new String[0]);
if (toks.length != tags.length) {
- LOG.finest("disagreement between toks and tags; sent = '" + sentence
- + "'\n tags = " + tags
- + "\n will now try this sentence in lower case");
+ LOG.trace("disagreement between toks and tags; sent = '{}'\n" +
+          "{} \n will now try this sentence in lower case", sentence, tags);
node = parseSentenceNode(sentence.toLowerCase());
if (node == null) {
- LOG.finest("Problem parsing sentence '" + sentence);
+ LOG.warn("Problem parsing sentence '{}'", sentence);
return null;
}
POSlist = node.getOrderedPOSList();
tags = POSlist.toArray(new String[0]);
if (toks.length != tags.length) {
- LOG.finest("AGAIN: disagreement between toks and tags for lower case!
");
+ LOG.trace("AGAIN: disagreement between toks and tags for lower case!");
if (toks.length > tags.length) {
String[] newToks = new String[tags.length];
for (int i = 0; i < tags.length; i++) {
@@ -489,20 +487,18 @@ public class ParserChunker2MatcherProcessor {
return null;
}
- public static List<SentenceNode> paragraphToSentenceNodes(
- List<Parse> paragraphParses) {
+  public static List<SentenceNode> paragraphToSentenceNodes(List<Parse> paragraphParses) {
if (paragraphParses == null || paragraphParses.size() == 0)
return null;
- List<SentenceNode> paragraphNodes = new ArrayList<>(
- paragraphParses.size());
+    List<SentenceNode> paragraphNodes = new ArrayList<>(paragraphParses.size());
for (Parse sentenceParse : paragraphParses) {
SentenceNode sentenceNode;
try {
sentenceNode = sentenceToSentenceNode(sentenceParse);
} catch (Exception e) {
// don't fail the whole paragraph when a single sentence fails
- LOG.severe("Failed to convert sentence to node. error: " + e);
+        LOG.warn("Failed to convert sentence to node. error: {}", e.getLocalizedMessage());
sentenceNode = null;
}
@@ -527,8 +523,7 @@ public class ParserChunker2MatcherProcessor {
if (node instanceof SentenceNode)
return (SentenceNode) node;
else if (node instanceof PhraseNode) {
- SentenceNode sn = new SentenceNode("sentence", node.getChildren());
- return sn;
+ return new SentenceNode("sentence", node.getChildren());
} else
return null;
}
@@ -581,51 +576,53 @@ public class ParserChunker2MatcherProcessor {
}
protected void initializeSentenceDetector() {
- try (InputStream is = new FileInputStream(MODEL_DIR + "/en-sent.bin")) {
+    try (InputStream is = new BufferedInputStream(new FileInputStream(MODEL_DIR + "/en-sent.bin"))) {
SentenceModel model = new SentenceModel(is);
sentenceDetector = new SentenceDetectorME(model);
} catch (IOException e) {
- e.printStackTrace();
- }
+ // we swallow exception to support the cached run
+ LOG.debug(e.getLocalizedMessage(), e);
+ }
}
protected void initializeTokenizer() {
- try (InputStream is = new FileInputStream(MODEL_DIR + "/en-token.bin")) {
+    try (InputStream is = new BufferedInputStream(new FileInputStream(MODEL_DIR + "/en-token.bin"))) {
TokenizerModel model = new TokenizerModel(is);
tokenizer = new TokenizerME(model);
} catch (IOException e) {
// we swallow exception to support the cached run
+ LOG.debug(e.getLocalizedMessage(), e);
}
- // we swallow exception to support the cached run
}
protected void initializePosTagger() {
-    try (InputStream is = new FileInputStream(MODEL_DIR + "/en-pos-maxent.bin")) {
+    try (InputStream is = new BufferedInputStream(new FileInputStream(MODEL_DIR + "/en-pos-maxent.bin"))) {
POSModel model = new POSModel(is);
posTagger = new POSTaggerME(model);
} catch (IOException e) {
// we swallow exception to support the cached run
+ LOG.debug(e.getLocalizedMessage(), e);
}
}
protected void initializeParser() {
-    try (InputStream is = new FileInputStream(MODEL_DIR + "/en-parser-chunking.bin")) {
+    try (InputStream is = new BufferedInputStream(new FileInputStream(MODEL_DIR + "/en-parser-chunking.bin"))) {
ParserModel model = new ParserModel(is);
parser = ParserFactory.create(model);
} catch (IOException e) {
- //e.printStackTrace();
+ // we swallow exception to support the cached run
+ LOG.debug(e.getLocalizedMessage(), e);
}
- // we swallow exception to support the cached run
}
private void initializeChunker() {
- try (InputStream is = new FileInputStream(MODEL_DIR + "/en-chunker.bin")) {
+    try (InputStream is = new BufferedInputStream(new FileInputStream(MODEL_DIR + "/en-chunker.bin"))) {
ChunkerModel model = new ChunkerModel(is);
chunker = new ChunkerME(model);
} catch (IOException e) {
- //e.printStackTrace();
+ // we swallow exception to support the cached run
+ LOG.debug(e.getLocalizedMessage(), e);
}
- // we swallow exception to support the cached run
}
/**
diff --git
a/opennlp-similarity/src/test/java/opennlp/tools/textsimilarity/chunker2matcher/ParserChunker2MatcherProcessorTest.java
b/opennlp-similarity/src/test/java/opennlp/tools/textsimilarity/chunker2matcher/ParserChunker2MatcherProcessorTest.java
index 295aafc..deda185 100644
---
a/opennlp-similarity/src/test/java/opennlp/tools/textsimilarity/chunker2matcher/ParserChunker2MatcherProcessorTest.java
+++
b/opennlp-similarity/src/test/java/opennlp/tools/textsimilarity/chunker2matcher/ParserChunker2MatcherProcessorTest.java
@@ -17,6 +17,7 @@
*/
package opennlp.tools.textsimilarity.chunker2matcher;
+import java.lang.invoke.MethodHandles;
import java.util.List;
import org.junit.jupiter.api.AfterEach;
@@ -26,12 +27,16 @@ import org.junit.jupiter.api.Test;
import opennlp.tools.textsimilarity.ParseTreeChunk;
import opennlp.tools.textsimilarity.ParseTreeChunkListScorer;
import opennlp.tools.textsimilarity.TextSimilarityBagOfWords;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertTrue;
class ParserChunker2MatcherProcessorTest {
+
+  private static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
   private final TextSimilarityBagOfWords parserBOW = new TextSimilarityBagOfWords();
   private final ParseTreeChunkListScorer parseTreeChunkListScorer = new ParseTreeChunkListScorer();
@@ -98,7 +103,7 @@ class ParserChunker2MatcherProcessorTest {
+ "This car has an amazingly good engine. "
+ "This car provides you a very good mileage.";
-    System.out.println(parser.assessRelevance(phrase1, phrase2).getMatchResult());
+    LOG.debug(parser.assessRelevance(phrase1, phrase2).getMatchResult().toString());
}
@@ -115,7 +120,7 @@ class ParserChunker2MatcherProcessorTest {
     double matchScore = parseTreeChunkListScorer.getParseTreeChunkListScore(matchResult);
     double bagOfWordsScore = parserBOW.assessRelevanceAndGetScore(phrase1, phrase2);
assertTrue(matchScore + 2 < bagOfWordsScore);
- System.out.println("MatchScore is adequate ( = " + matchScore
+ LOG.debug("MatchScore is adequate ( = " + matchScore
+ ") and bagOfWordsScore = " + bagOfWordsScore + " is too high");
// we now demonstrate how similarity can be captured by POS and cannot be
@@ -129,7 +134,7 @@ class ParserChunker2MatcherProcessorTest {
     matchScore = parseTreeChunkListScorer.getParseTreeChunkListScore(matchResult);
bagOfWordsScore = parserBOW.assessRelevanceAndGetScore(phrase1, phrase2);
assertTrue(matchScore > 2 * bagOfWordsScore);
-    System.out.println("MatchScore is adequate ( = " + matchScore + ") and bagOfWordsScore = " + bagOfWordsScore + " is too low");
+    LOG.debug("MatchScore is adequate ( = " + matchScore + ") and bagOfWordsScore = " + bagOfWordsScore + " is too low");
}
}
diff --git a/pom.xml b/pom.xml
index 7149ef7..6494209 100644
--- a/pom.xml
+++ b/pom.xml
@@ -120,6 +120,9 @@
<opennlp.tools.version>2.1.1</opennlp.tools.version>
<opennlp.forkCount>1.0C</opennlp.forkCount>
+ <slf4j.version>1.7.36</slf4j.version>
+ <log4j2.version>2.20.0</log4j2.version>
+
<uimaj.version>3.3.1</uimaj.version>
<jersey-client.version>2.39.1</jersey-client.version>
<jersey-server.version>2.39.1</jersey-server.version>
@@ -151,6 +154,12 @@
<type>test-jar</type>
</dependency>
+ <dependency>
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-api</artifactId>
+ <version>${slf4j.version}</version>
+ </dependency>
+
<dependency>
<groupId>commons-lang</groupId>
<artifactId>commons-lang</artifactId>
@@ -187,6 +196,25 @@
<version>1.9.4</version>
</dependency>
+ <dependency>
+ <groupId>org.apache.logging.log4j</groupId>
+ <artifactId>log4j-api</artifactId>
+ <version>${log4j2.version}</version>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.logging.log4j</groupId>
+ <artifactId>log4j-core</artifactId>
+ <version>${log4j2.version}</version>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.logging.log4j</groupId>
+ <artifactId>log4j-slf4j-impl</artifactId>
+ <version>${log4j2.version}</version>
+ <scope>test</scope>
+ </dependency>
+
<dependency>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter-api</artifactId>