[ https://issues.apache.org/jira/browse/OPENNLP-1594?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17866133#comment-17866133 ]
ASF GitHub Bot commented on OPENNLP-1594: ----------------------------------------- rzo1 commented on code in PR #158: URL: https://github.com/apache/opennlp-sandbox/pull/158#discussion_r1678162623 ########## summarizer/src/main/java/opennlp/summarization/textrank/TextRank.java: ########## @@ -97,13 +125,17 @@ public double getWeightedSimilarity(String sent1, String sent2, return (wordsInCommon) / (words1.length + words2.length); } - // Gets the current score from the list of scores passed ... + /** + * @param scores A list of {@link Score} instances. + * @param id The sentence id to check for. + * @return Gets the element from {@code scores} that matches the passed sentence {@code id}. + */ public double getScoreFrom(List<Score> scores, int id) { for (Score s : scores) { if (s.getSentId() == id) return s.getScore(); } - return 1; + return 1; // Why is the default score "1" here? Review Comment: :) ########## summarizer/src/main/java/opennlp/summarization/lexicalchaining/LexicalChainingSummarizer.java: ########## @@ -44,95 +48,128 @@ public class LexicalChainingSummarizer implements Summarizer { private final DocProcessor docProcessor; private final WordRelationshipDetermination wordRel; - public LexicalChainingSummarizer(DocProcessor dp, OpenNLPPOSTagger posTagger) { - docProcessor = dp; - tagger = posTagger; - wordRel = new WordRelationshipDetermination(); + /** + * Instantiates a {@link LexicalChainingSummarizer}. + * + * @param docProcessor The {@link DocProcessor} to use at runtime. Must not be {@code null}. + * @param languageCode An ISO-language code for obtaining a {@link POSModel}. + * Must not be {@code null}. + * + * @throws IllegalArgumentException Thrown if parameters are invalid. + */ + public LexicalChainingSummarizer(DocProcessor docProcessor, String languageCode) throws IOException { + this(docProcessor, new NounPOSTagger(languageCode)); } - public LexicalChainingSummarizer(DocProcessor dp, InputStream posModelFile) throws Exception { - this(dp, new OpenNLPPOSTagger(dp, posModelFile)); + /** + * Instantiates a {@link LexicalChainingSummarizer}. + * + * @param docProcessor The {@link DocProcessor} to use at runtime. Must not be {@code null}. + * @param posTagger The {@link NounPOSTagger} to use at runtime. Must not be {@code null}. + * + * @throws IllegalArgumentException Thrown if parameters are invalid. + */ + public LexicalChainingSummarizer(DocProcessor docProcessor, NounPOSTagger posTagger) { + if (docProcessor == null) throw new IllegalArgumentException("Parameter 'docProcessor' must not be null!"); + if (posTagger == null) throw new IllegalArgumentException("Parameter 'posTagger' must not be null!"); + + this.docProcessor = docProcessor; + tagger = posTagger; + wordRel = new WordRelationshipDetermination(); } - //Build Lexical chains.. - public List<LexicalChain> buildLexicalChains(String article, List<Sentence> sent) { - // POS tag article - Hashtable<String, List<LexicalChain>> chains = new Hashtable<>(); - List<LexicalChain> lc = new ArrayList<>(); - // Build lexical chains - // For each sentence - for (Sentence currSent : sent) { - String taggedSent = tagger.getTaggedString(currSent.getStringVal()); - List<String> nouns = tagger.getWordsOfType(taggedSent, POSTagger.NOUN); - // For each noun - for (String noun : nouns) { - int chainsAddCnt = 0; - // Loop through each LC - for (LexicalChain l : lc) { - try { - WordRelation rel = wordRel.getRelation(l, noun, (currSent.getSentId() - l.start) > 7); - // Is the noun an exact match to one of the current LCs (Strong relation) - // Add sentence to chain - if (rel.relation() == WordRelation.STRONG_RELATION) { - addToChain(rel.dest(), l, chains, currSent); - if (currSent.getSentId() - l.last > 10) { - l.occurrences++; - l.start = currSent.getSentId(); - } - chainsAddCnt++; - } else if (rel.relation() == WordRelation.MED_RELATION) { - // Add sentence to chain if it is 7 sentences away from start of chain - addToChain(rel.dest(), l, chains, currSent); - chainsAddCnt++; - //If greater than 7 we will add it but call it a new occurrence of the lexical chain... - if (currSent.getSentId() - l.start > 7) { - l.occurrences++; - l.start = currSent.getSentId(); - } - } else if (rel.relation() == WordRelation.WEAK_RELATION) { - if (currSent.getSentId() - l.start <= 3) { + /** + * Constructs a list of {@link LexicalChain lexical chains} from specified sentences. + * + * @param sentences The list of {@link Sentence sentences} to build lexical chains from. + * Must not be {@code null}. + * @return The result list of {@link LexicalChain lexical chains}. Guaranteed to be not {@code null}. + * @throws IllegalArgumentException Thrown if parameters are invalid. + */ + public List<LexicalChain> buildLexicalChains(List<Sentence> sentences) { + if (sentences == null) throw new IllegalArgumentException("Parameter 'sentences' must not be null!"); + else { + if (sentences.isEmpty()) { + return Collections.emptyList(); + } + Hashtable<String, List<LexicalChain>> chains = new Hashtable<>(); + List<LexicalChain> lc = new ArrayList<>(); + // Build lexical chains + // For each sentence + for (Sentence currSent : sentences) { + // POS tag article + String taggedSent = tagger.getTaggedString(currSent.getStringVal().replace(".", " .")); + List<String> nouns = tagger.getWordsOfType(docProcessor.getWords(taggedSent), POSTagger.NOUN); + // For each noun + for (String noun : nouns) { + int chainsAddCnt = 0; + // Loop through each LC + for (LexicalChain l : lc) { + try { + WordRelation rel = wordRel.getRelation(l, noun, (currSent.getSentId() - l.start) > 7); Review Comment: Maybe we can use speaking constants for `3`, `7` and `10` here because it seems, that they decode the number of sentences from the start of the chain? > Add stricter tests for Summarizer component > ------------------------------------------- > > Key: OPENNLP-1594 > URL: https://issues.apache.org/jira/browse/OPENNLP-1594 > Project: OpenNLP > Issue Type: Test > Affects Versions: 2.3.3 > Reporter: Martin Wiesner > Assignee: Martin Wiesner > Priority: Major > Fix For: 2.4.0 > > > While OPENNLP-1592 and OPENNLP-1593 raised test coverage for the summarizer > component, there are still some "dark spots" and probably inaccuracies in the > implementation of that component. More, stricter tests shall shed light on > the situation. > Aims: > * add further, stricter tests for the summarizer component > * clarify, at API level, the semantics and constraints of parameters > * better separate tests so that each test class has a clear responsibility > for its class under test > * improve / enhance the JavaDoc further -- This message was sent by Atlassian Jira (v8.20.10#820010)