This is an automated email from the ASF dual-hosted git repository. mawiesne pushed a commit to branch enhance_split_for_space_operations_to_cover_more_cases in repository https://gitbox.apache.org/repos/asf/opennlp-sandbox.git
commit 54f3ae1a54d6dfd7f9232b20583bd5787fa367b6 Author: Martin Wiesner <[email protected]> AuthorDate: Mon Apr 17 10:28:09 2023 +0200 Enhances existing code in the `summarizer` component to use "\\s+" as the split pattern instead of a single space character (" "). Improves JavaDoc along the way. Improves formatting along the way. --- .../src/main/java/opennlp/summarization/Score.java | 6 +-- .../main/java/opennlp/summarization/Sentence.java | 4 +- .../WordRelationshipDetermination.java | 44 ++++++++++------------ .../preprocess/DefaultDocProcessor.java | 2 +- .../opennlp/summarization/textrank/TextRank.java | 12 +++--- 5 files changed, 31 insertions(+), 37 deletions(-) diff --git a/summarizer/src/main/java/opennlp/summarization/Score.java b/summarizer/src/main/java/opennlp/summarization/Score.java index 2fc2977..eeda3e7 100755 --- a/summarizer/src/main/java/opennlp/summarization/Score.java +++ b/summarizer/src/main/java/opennlp/summarization/Score.java @@ -18,11 +18,11 @@ package opennlp.summarization; /** - * A utility class to store the score of a sentence for ranking sentences within a document. + * Stores the score of a sentence for ranking sentences within a document. 
*/ public class Score implements Comparable<Score> { - int sentId; - public double score; + private int sentId; + private double score; public Score() { diff --git a/summarizer/src/main/java/opennlp/summarization/Sentence.java b/summarizer/src/main/java/opennlp/summarization/Sentence.java index 07079b2..a0a96c8 100755 --- a/summarizer/src/main/java/opennlp/summarization/Sentence.java +++ b/summarizer/src/main/java/opennlp/summarization/Sentence.java @@ -93,7 +93,7 @@ public class Sentence { private int calcWrdCnt(String stringVal2) { int ret = 0; StopWords sw = StopWords.getInstance(); - String[] wrds = stringVal.split(" "); + String[] wrds = stringVal.split("\\s+"); for(String wrd: wrds){ if(!sw.isStopWord(wrd)&&!wrd.startsWith("'")&&!wrd.equals(".")&&!wrd.equals("?")) ret++; @@ -131,7 +131,7 @@ public class Sentence { public int getWordCnt() { - return wordCnt==0? this.getStringVal().split(" ").length: wordCnt; + return wordCnt==0? this.getStringVal().split("\\s+").length: wordCnt; } // Should add an article id to the sentence class. For now returns true if the ids are the same. diff --git a/summarizer/src/main/java/opennlp/summarization/lexicalchaining/WordRelationshipDetermination.java b/summarizer/src/main/java/opennlp/summarization/lexicalchaining/WordRelationshipDetermination.java index 524b420..eb960d0 100644 --- a/summarizer/src/main/java/opennlp/summarization/lexicalchaining/WordRelationshipDetermination.java +++ b/summarizer/src/main/java/opennlp/summarization/lexicalchaining/WordRelationshipDetermination.java @@ -35,11 +35,11 @@ import edu.mit.jwi.RAMDictionary; /** * Uses wordnet to determine the relation of two words. 
- * Words have - + * Words have: * <ul> - * <li>strong relationship: same word</li> + * <li>Strong relationship: same word</li> * <li>Med relationship: synonym, hyponym</li> - * <li>weak relationship: antonym, hypernym</li> + * <li>Weak relationship: antonym, hypernym</li> * <li>No relationship: otherwise</li> * </ul> */ @@ -54,7 +54,7 @@ public class WordRelationshipDetermination { private final Hashtable<ISynsetID, ISynset> cache = new Hashtable<>(); private final Hashtable<ISynset, List<IWord>> synsetWordCache = new Hashtable<>(); - public WordRelationshipDetermination() throws Exception { + public WordRelationshipDetermination() { dictionary = new RAMDictionary(WordRelationshipDetermination.class.getResource(DICTIONARY_FILE), ILoadPolicy.IMMEDIATE_LOAD); ((RAMDictionary)dictionary).load(); openDict(); @@ -76,9 +76,9 @@ public class WordRelationshipDetermination { //Get the synset in which word is present. ISynset wordSynset; - if(ww.synonyms!=null) + if (ww.synonyms!=null) wordSynset = ww.synonyms; - else{ + else { IWord word = dictionary.getWord((IWordID)w.getID()); wordSynset = word.getSynset(); ww.synonyms = wordSynset; @@ -89,11 +89,11 @@ public class WordRelationshipDetermination { } return ret; } + /* * Returns true if the word represented by idxNoun is present in a synset. 
*/ - private IWord inSynset(ISynset wordSynset, IIndexWord idxNoun) - { + private IWord inSynset(ISynset wordSynset, IIndexWord idxNoun) { IWord ret = null; List<IWord> wrds; @@ -105,12 +105,9 @@ public class WordRelationshipDetermination { // } //Returns all the words present in the synset wordSynset - for(IWord synonym : wrds) - { - for(IWordID nounID : idxNoun.getWordIDs()) - { - if(synonym.equals(dictionary.getWord(nounID))) - { + for(IWord synonym : wrds) { + for(IWordID nounID : idxNoun.getWordIDs()) { + if(synonym.equals(dictionary.getWord(nounID))) { ret = synonym; break; } @@ -140,16 +137,15 @@ public class WordRelationshipDetermination { ISynset wordSynset = word.getSynset(); for(Pointer p : rels) { - List<ISynsetID> rels; - if(ww.rels.get(p)!=null) + if (ww.rels.get(p)!=null) rels = ww.rels.get(p); else { rels = wordSynset.getRelatedSynsets(p); ww.rels.put(p, rels); } - for(ISynsetID id: rels) { + for (ISynsetID id: rels) { ISynset s = this.dictionary.getSynset(id); IWord mat = inSynset(s, idxNoun); if(mat!=null) @@ -174,7 +170,7 @@ public class WordRelationshipDetermination { public WordRelation getRelation(LexicalChain l, String noun, boolean checkMed) { WordRelation ret = new WordRelation(); ret.relation = WordRelation.NO_RELATION; - for(Word w : l.word) { + for (Word w : l.word) { //Exact match is a string relation. 
if(w.getLexicon().equalsIgnoreCase(noun)) { ret.relation = WordRelation.STRONG_RELATION; @@ -185,8 +181,7 @@ public class WordRelationshipDetermination { // else it is a Wordnet word and is it a synonym or hyponym of LCs (medium relation) else if(w.getID()!=null && checkMed){ Word wrel = isMediumRel(noun, w) ; - if(wrel!=null) - { + if(wrel!=null) { ret.relation = WordRelation.MED_RELATION; ret.src = w; ret.dest = wrel; @@ -205,20 +200,19 @@ public class WordRelationshipDetermination { e.printStackTrace(); } } + public List<Word> getWordSenses(String noun) { List<Word> ret = new ArrayList<>(); - try{ + try { // openDict(); List<IWordID> wordIDs = this.dictionary.getIndexWord(noun, POS.NOUN).getWordIDs(); - for(IWordID wid: wordIDs) - { + for(IWordID wid: wordIDs) { Word w = new WordnetWord(); w.setLexicon(noun); w.setID(wid); ret.add(w); } - }catch(Exception ex){ - // ex.printStackTrace(); + } catch(Exception ex){ //Not in dictionary Word w = new WordnetWord(); w.setLexicon(noun); diff --git a/summarizer/src/main/java/opennlp/summarization/preprocess/DefaultDocProcessor.java b/summarizer/src/main/java/opennlp/summarization/preprocess/DefaultDocProcessor.java index f4e1a0e..e491aec 100755 --- a/summarizer/src/main/java/opennlp/summarization/preprocess/DefaultDocProcessor.java +++ b/summarizer/src/main/java/opennlp/summarization/preprocess/DefaultDocProcessor.java @@ -208,7 +208,7 @@ public class DefaultDocProcessor implements DocProcessor { @Override public String[] getWords(String sent) { - return sent.split(" "); + return sent.trim().split("\\s+"); } @Override diff --git a/summarizer/src/main/java/opennlp/summarization/textrank/TextRank.java b/summarizer/src/main/java/opennlp/summarization/textrank/TextRank.java index b6072eb..f4b5470 100755 --- a/summarizer/src/main/java/opennlp/summarization/textrank/TextRank.java +++ b/summarizer/src/main/java/opennlp/summarization/textrank/TextRank.java @@ -75,8 +75,8 @@ public class TextRank { // words.. 
public double getWeightedSimilarity(String sent1, String sent2, Hashtable<String, Double> wrdWts) { - String[] words1 = sent1.split(" "); - String[] words2 = sent2.split(" "); + String[] words1 = sent1.trim().split("\\s+"); + String[] words2 = sent2.trim().split("\\s+"); double wordsInCommon = 0; Hashtable<String, Boolean> dups = new Hashtable<>(); for (String s : words1) { @@ -173,8 +173,7 @@ public class TextRank { for (int i = 0; i < sentences.size(); i++) { String nextSent = sentences.get(i); - String[] words = nextSent.split(" "); - List<Integer> processed = new ArrayList<>(); + String[] words = nextSent.trim().split("\\s+"); Score s = new Score(); s.setSentId(i); @@ -185,6 +184,7 @@ public class TextRank { if (otherSents == null) continue; + List<Integer> processed = new ArrayList<>(); for (int idx : otherSents) { if (idx != i && !processed.contains(idx)) { double currS = getWeightedSimilarity(sentences.get(i), @@ -233,7 +233,7 @@ public class TextRank { if (HIGHER_TITLE_WEIGHT && getSentences().size()>0) { String sent = getSentences().get(0); - String[] wrds = sent.split(" "); + String[] wrds = sent.trim().split("\\s+"); for (String wrd : wrds) wrdWts.put(wrd, TITLE_WRD_WT); } @@ -278,7 +278,7 @@ public class TextRank { /* * public double getScore(String sent1, String sent2, boolean toPrint) { - * String[] words1 = sent1.split(" "); String[] words2 = sent2.split(" "); + * String[] words1 = sent1.split("\\s+"); String[] words2 = sent2.split("\\s+"); * double wordsInCommon = 0; for(int i=0;i< words1.length;i++) { for(int * j=0;j<words2.length;j++) { if(!sw.isStopWord(words1[i]) && * !words1[i].trim().isEmpty() && words1[i].equals(words2[j])) { wordsInCommon+=
