[opennlp-sandbox] 01/01: enhances existing code in the `summarizer` component to use "\\s+" as split pattern instead of simply using a single space (" "); improves JavaDoc along the way; improves formatting along the way

mawiesne Mon, 17 Apr 2023 01:28:23 -0700

This is an automated email from the ASF dual-hosted git repository.

mawiesne pushed a commit to branch enhance_split_for_space_operations_to_cover_more_cases
in repository https://gitbox.apache.org/repos/asf/opennlp-sandbox.git


commit 54f3ae1a54d6dfd7f9232b20583bd5787fa367b6
Author: Martin Wiesner <[email protected]>
AuthorDate: Mon Apr 17 10:28:09 2023 +0200

    Enhances existing code in the `summarizer` component to use "\\s+" as the split pattern instead of simply using a single space (" ").
    Improves JavaDoc along the way.
    Improves formatting along the way.
---
 .../src/main/java/opennlp/summarization/Score.java |  6 +--
 .../main/java/opennlp/summarization/Sentence.java  |  4 +-
 .../WordRelationshipDetermination.java             | 44 ++++++++++------------
 .../preprocess/DefaultDocProcessor.java            |  2 +-
 .../opennlp/summarization/textrank/TextRank.java   | 12 +++---
 5 files changed, 31 insertions(+), 37 deletions(-)

diff --git a/summarizer/src/main/java/opennlp/summarization/Score.java 
b/summarizer/src/main/java/opennlp/summarization/Score.java
index 2fc2977..eeda3e7 100755
--- a/summarizer/src/main/java/opennlp/summarization/Score.java
+++ b/summarizer/src/main/java/opennlp/summarization/Score.java
@@ -18,11 +18,11 @@
 package opennlp.summarization;
 
 /**
- * A utility class to store the score of a sentence for ranking sentences 
within a document.
+ * Stores the score of a sentence for ranking sentences within a document.
  */
 public class Score implements Comparable<Score> {
-  int sentId;
-  public double score;
+  private int sentId;
+  private double score;
 
   public Score()
   {
diff --git a/summarizer/src/main/java/opennlp/summarization/Sentence.java 
b/summarizer/src/main/java/opennlp/summarization/Sentence.java
index 07079b2..a0a96c8 100755
--- a/summarizer/src/main/java/opennlp/summarization/Sentence.java
+++ b/summarizer/src/main/java/opennlp/summarization/Sentence.java
@@ -93,7 +93,7 @@ public class Sentence {
        private int calcWrdCnt(String stringVal2) {
                int ret = 0;
                StopWords sw = StopWords.getInstance();
-               String[] wrds = stringVal.split(" ");
+               String[] wrds = stringVal.split("\\s+");
                for(String wrd: wrds){
                        
if(!sw.isStopWord(wrd)&&!wrd.startsWith("'")&&!wrd.equals(".")&&!wrd.equals("?"))
                                ret++;
@@ -131,7 +131,7 @@ public class Sentence {
        
        public int getWordCnt()
        {
-               return wordCnt==0? this.getStringVal().split(" ").length: 
wordCnt;
+               return wordCnt==0? this.getStringVal().split("\\s+").length: 
wordCnt;
        }
 
        // Should add an article id to the sentence class. For now returns true 
if the ids are the same.
diff --git 
a/summarizer/src/main/java/opennlp/summarization/lexicalchaining/WordRelationshipDetermination.java
 
b/summarizer/src/main/java/opennlp/summarization/lexicalchaining/WordRelationshipDetermination.java
index 524b420..eb960d0 100644
--- 
a/summarizer/src/main/java/opennlp/summarization/lexicalchaining/WordRelationshipDetermination.java
+++ 
b/summarizer/src/main/java/opennlp/summarization/lexicalchaining/WordRelationshipDetermination.java
@@ -35,11 +35,11 @@ import edu.mit.jwi.RAMDictionary;
 
 /**
  * Uses wordnet to determine the relation of two words.
- * Words have -
+ * Words have:
  * <ul>
- * <li>strong relationship: same word</li>
+ * <li>Strong relationship: same word</li>
  * <li>Med relationship: synonym, hyponym</li>
- * <li>weak relationship: antonym, hypernym</li>
+ * <li>Weak relationship: antonym, hypernym</li>
  * <li>No relationship: otherwise</li>
  * </ul>
  */
@@ -54,7 +54,7 @@ public class WordRelationshipDetermination {
   private final Hashtable<ISynsetID, ISynset> cache = new Hashtable<>();
   private final Hashtable<ISynset, List<IWord>> synsetWordCache = new 
Hashtable<>();
 
-  public WordRelationshipDetermination() throws Exception {
+  public WordRelationshipDetermination() {
     dictionary = new 
RAMDictionary(WordRelationshipDetermination.class.getResource(DICTIONARY_FILE), 
ILoadPolicy.IMMEDIATE_LOAD);
     ((RAMDictionary)dictionary).load();
     openDict();
@@ -76,9 +76,9 @@ public class WordRelationshipDetermination {
 
       //Get the synset in which word is present.
       ISynset wordSynset;
-      if(ww.synonyms!=null)
+      if (ww.synonyms!=null)
         wordSynset = ww.synonyms;
-      else{
+      else {
         IWord word = dictionary.getWord((IWordID)w.getID());
         wordSynset = word.getSynset();
         ww.synonyms = wordSynset;
@@ -89,11 +89,11 @@ public class WordRelationshipDetermination {
     }
     return ret;
   }
+
   /*
    * Returns true if the word represented by idxNoun is present in a synset.
    */
-  private IWord inSynset(ISynset wordSynset, IIndexWord idxNoun)
-  {
+  private IWord inSynset(ISynset wordSynset, IIndexWord idxNoun) {
     IWord ret = null;
     List<IWord> wrds;
 
@@ -105,12 +105,9 @@ public class WordRelationshipDetermination {
 //             }
 
     //Returns all the words present in the synset wordSynset
-    for(IWord synonym : wrds)
-    {
-      for(IWordID nounID : idxNoun.getWordIDs())
-      {
-        if(synonym.equals(dictionary.getWord(nounID)))
-        {
+    for(IWord synonym : wrds) {
+      for(IWordID nounID : idxNoun.getWordIDs()) {
+        if(synonym.equals(dictionary.getWord(nounID))) {
           ret = synonym;
           break;
         }
@@ -140,16 +137,15 @@ public class WordRelationshipDetermination {
     ISynset wordSynset = word.getSynset();
 
     for(Pointer p : rels) {
-
       List<ISynsetID> rels;
-      if(ww.rels.get(p)!=null)
+      if (ww.rels.get(p)!=null)
         rels = ww.rels.get(p);
       else {
         rels = wordSynset.getRelatedSynsets(p);
         ww.rels.put(p, rels);
       }
 
-      for(ISynsetID id: rels) {
+      for (ISynsetID id: rels) {
         ISynset s = this.dictionary.getSynset(id);
         IWord mat = inSynset(s, idxNoun);
         if(mat!=null)
@@ -174,7 +170,7 @@ public class WordRelationshipDetermination {
   public WordRelation getRelation(LexicalChain l, String noun, boolean 
checkMed) {
     WordRelation ret = new WordRelation();
     ret.relation = WordRelation.NO_RELATION;
-    for(Word w : l.word) {
+    for (Word w : l.word) {
       //Exact match is a string relation.
       if(w.getLexicon().equalsIgnoreCase(noun)) {
         ret.relation = WordRelation.STRONG_RELATION;
@@ -185,8 +181,7 @@ public class WordRelationshipDetermination {
       //  else it is a Wordnet word and is it a synonym or hyponym of LCs 
(medium relation)
       else if(w.getID()!=null && checkMed){
         Word wrel = isMediumRel(noun, w) ;
-        if(wrel!=null)
-        {
+        if(wrel!=null) {
           ret.relation = WordRelation.MED_RELATION;
           ret.src = w;
           ret.dest = wrel;
@@ -205,20 +200,19 @@ public class WordRelationshipDetermination {
         e.printStackTrace();
       }
   }
+
   public List<Word> getWordSenses(String noun) {
     List<Word> ret = new ArrayList<>();
-    try{
+    try {
       //               openDict();
       List<IWordID> wordIDs = this.dictionary.getIndexWord(noun, 
POS.NOUN).getWordIDs();
-      for(IWordID wid: wordIDs)
-      {
+      for(IWordID wid: wordIDs) {
         Word w = new WordnetWord();
         w.setLexicon(noun);
         w.setID(wid);
         ret.add(w);
       }
-    }catch(Exception ex){
-      // ex.printStackTrace();
+    } catch(Exception ex){
       //Not in dictionary
       Word w = new WordnetWord();
       w.setLexicon(noun);
diff --git 
a/summarizer/src/main/java/opennlp/summarization/preprocess/DefaultDocProcessor.java
 
b/summarizer/src/main/java/opennlp/summarization/preprocess/DefaultDocProcessor.java
index f4e1a0e..e491aec 100755
--- 
a/summarizer/src/main/java/opennlp/summarization/preprocess/DefaultDocProcessor.java
+++ 
b/summarizer/src/main/java/opennlp/summarization/preprocess/DefaultDocProcessor.java
@@ -208,7 +208,7 @@ public class DefaultDocProcessor implements DocProcessor {
   @Override
   public String[] getWords(String sent)
   {
-    return sent.split(" ");
+    return sent.trim().split("\\s+");
   }
 
   @Override
diff --git 
a/summarizer/src/main/java/opennlp/summarization/textrank/TextRank.java 
b/summarizer/src/main/java/opennlp/summarization/textrank/TextRank.java
index b6072eb..f4b5470 100755
--- a/summarizer/src/main/java/opennlp/summarization/textrank/TextRank.java
+++ b/summarizer/src/main/java/opennlp/summarization/textrank/TextRank.java
@@ -75,8 +75,8 @@ public class TextRank {
   // words..
   public double getWeightedSimilarity(String sent1, String sent2,
                                       Hashtable<String, Double> wrdWts) {
-    String[] words1 = sent1.split(" ");
-    String[] words2 = sent2.split(" ");
+    String[] words1 = sent1.trim().split("\\s+");
+    String[] words2 = sent2.trim().split("\\s+");
     double wordsInCommon = 0;
     Hashtable<String, Boolean> dups = new Hashtable<>();
     for (String s : words1) {
@@ -173,8 +173,7 @@ public class TextRank {
 
     for (int i = 0; i < sentences.size(); i++) {
       String nextSent = sentences.get(i);
-      String[] words = nextSent.split(" ");
-      List<Integer> processed = new ArrayList<>();
+      String[] words = nextSent.trim().split("\\s+");
       Score s = new Score();
       s.setSentId(i);
 
@@ -185,6 +184,7 @@ public class TextRank {
         if (otherSents == null)
           continue;
 
+        List<Integer> processed = new ArrayList<>();
         for (int idx : otherSents) {
           if (idx != i && !processed.contains(idx)) {
             double currS = getWeightedSimilarity(sentences.get(i),
@@ -233,7 +233,7 @@ public class TextRank {
 
     if (HIGHER_TITLE_WEIGHT && getSentences().size()>0) {
       String sent = getSentences().get(0);
-      String[] wrds = sent.split(" ");
+      String[] wrds = sent.trim().split("\\s+");
       for (String wrd : wrds)
         wrdWts.put(wrd, TITLE_WRD_WT);
     }
@@ -278,7 +278,7 @@ public class TextRank {
 
 /*
  * public double getScore(String sent1, String sent2, boolean toPrint) {
- * String[] words1 = sent1.split(" "); String[] words2 = sent2.split(" ");
+ * String[] words1 = sent1.split("\\s+"); String[] words2 = 
sent2.split("\\s+");
  * double wordsInCommon = 0; for(int i=0;i< words1.length;i++) { for(int
  * j=0;j<words2.length;j++) { if(!sw.isStopWord(words1[i]) &&
  * !words1[i].trim().isEmpty() && words1[i].equals(words2[j])) { 
wordsInCommon+=

[opennlp-sandbox] 01/01: enhances existing code in the `summarizer` component to use "\\s+" as split pattern instead of simply using a single space (" "); improves JavaDoc along the way; improves formatting along the way

Reply via email to