This is an automated email from the ASF dual-hosted git repository. mawiesne pushed a commit to branch sanitize_some_TODOs_and_unhealthy_code in repository https://gitbox.apache.org/repos/asf/opennlp-sandbox.git
commit 5348212e75d579dc4a95343f6e7840b8c5ee6ede Author: Martin Wiesner <[email protected]> AuthorDate: Fri Feb 24 19:00:29 2023 +0100 sanitize some TODOs and unhealthy code - addresses open TODO in `PredictTest` which is now only displayed if platform arch is 'aarch64' (aka Apple Silicon) - fixes incomplete resource handling of IO streams - adds Override annotation where useful - extracts regex Patterns instead of inline recompilation of those in loops - removes "dead" (unused) `FileHandler` class which was flawed for several reasons anyway - removes unused ImageIO / AWT bound code from `Utils` class - clears some unused variables/fields - removes some unclear TODO leftovers - fixes some broken indentation along the path --- .../opennlp/tools/coref/mention/DefaultParse.java | 34 +- .../muc/Muc6FullParseCorefSampleStreamFactory.java | 3 +- .../formats/muc/NameFinderCorefEnhancerStream.java | 1 + .../JSMLearnerOnLatticeWithAbduction.java | 21 +- .../tools/similarity/apps/ContentGenerator.java | 3 +- .../similarity/apps/ContentGeneratorSupport.java | 29 +- .../similarity/apps/RelatedSentenceFinder.java | 111 +++--- .../similarity/apps/RelatedSentenceFinderML.java | 4 - .../solr/WordDocBuilderSingleImageSearchCall.java | 68 ++-- .../taxo_builder/TaxoQuerySnapshotMatcher.java | 4 +- .../taxo_builder/TaxonomyExtenderViaMebMining.java | 33 +- .../tools/similarity/apps/utils/FileHandler.java | 373 --------------------- .../opennlp/tools/similarity/apps/utils/Utils.java | 36 +- .../cmdline/disambiguator/DisambiguatorTool.java | 6 +- .../disambiguator/IMSWSDContextGenerator.java | 10 +- .../disambiguator/OSCCWSDContextGenerator.java | 7 +- .../tools/disambiguator/WSDDefaultParameters.java | 4 +- .../java/opennlp/tools/disambiguator/WSDModel.java | 3 +- .../tools/disambiguator/WSDSampleStream.java | 3 +- .../disambiguator/WSDisambiguatorFactory.java | 3 +- .../tools/disambiguator/WSDisambiguatorME.java | 4 +- .../java/opennlp/tools/disambiguator/WordPOS.java | 4 +- 
.../main/java/org/apache/opennlp/ModelUtil.java | 22 +- .../org/apache/opennlp/namefinder/PredictTest.java | 16 +- 24 files changed, 178 insertions(+), 624 deletions(-) diff --git a/opennlp-coref/src/main/java/opennlp/tools/coref/mention/DefaultParse.java b/opennlp-coref/src/main/java/opennlp/tools/coref/mention/DefaultParse.java index 725a213..d3566e1 100644 --- a/opennlp-coref/src/main/java/opennlp/tools/coref/mention/DefaultParse.java +++ b/opennlp-coref/src/main/java/opennlp/tools/coref/mention/DefaultParse.java @@ -23,7 +23,6 @@ import java.util.HashSet; import java.util.LinkedList; import java.util.List; import java.util.Set; -import java.util.Stack; import opennlp.tools.parser.Parse; import opennlp.tools.parser.chunking.Parser; @@ -56,10 +55,12 @@ public class DefaultParse extends AbstractParse { // Should we just maintain a parse id map !? } + @Override public int getSentenceNumber() { return sentenceNumber; } + @Override public List<opennlp.tools.coref.mention.Parse> getNamedEntities() { List<Parse> names = new ArrayList<>(); List<Parse> kids = new LinkedList<>(Arrays.asList(parse.getChildren())); @@ -75,10 +76,12 @@ public class DefaultParse extends AbstractParse { return createParses(names.toArray(new Parse[names.size()])); } + @Override public List<opennlp.tools.coref.mention.Parse> getChildren() { return createParses(parse.getChildren()); } + @Override public List<opennlp.tools.coref.mention.Parse> getSyntacticChildren() { List<Parse> kids = new ArrayList<>(Arrays.asList(parse.getChildren())); for (int ci = 0; ci < kids.size(); ci++) { @@ -92,6 +95,7 @@ public class DefaultParse extends AbstractParse { return createParses(kids.toArray(new Parse[kids.size()])); } + @Override public List<opennlp.tools.coref.mention.Parse> getTokens() { List<Parse> tokens = new ArrayList<>(); List<Parse> kids = new LinkedList<>(Arrays.asList(parse.getChildren())); @@ -107,6 +111,7 @@ public class DefaultParse extends AbstractParse { return createParses(tokens.toArray(new 
Parse[tokens.size()])); } + @Override public String getSyntacticType() { if (ENTITY_SET.contains(parse.getType())) { return null; @@ -129,6 +134,7 @@ public class DefaultParse extends AbstractParse { return newParses; } + @Override public String getEntityType() { if (ENTITY_SET.contains(parse.getType())) { return parse.getType(); @@ -138,6 +144,7 @@ public class DefaultParse extends AbstractParse { } } + @Override public boolean isParentNAC() { Parse parent = parse.getParent(); while (parent != null) { @@ -149,6 +156,7 @@ public class DefaultParse extends AbstractParse { return false; } + @Override public opennlp.tools.coref.mention.Parse getParent() { Parse parent = parse.getParent(); if (parent == null) { @@ -159,32 +167,32 @@ public class DefaultParse extends AbstractParse { } } + @Override public boolean isNamedEntity() { // TODO: We should use here a special tag to, where // the type can be extracted from. Then it just depends // on the training data and not the values inside NAME_TYPES. 
- - if (ENTITY_SET.contains(parse.getType())) { - return true; - } - else { - return false; - } + + return ENTITY_SET.contains(parse.getType()); } + @Override public boolean isNounPhrase() { return parse.getType().equals("NP") || parse.getType().startsWith("NP#"); } + @Override public boolean isSentence() { return parse.getType().equals(Parser.TOP_NODE); } + @Override public boolean isToken() { return parse.isPosTag(); } + @Override public int getEntityId() { String type = parse.getType(); @@ -198,16 +206,17 @@ public class DefaultParse extends AbstractParse { } } + @Override public Span getSpan() { return parse.getSpan(); } + @Override public int compareTo(opennlp.tools.coref.mention.Parse p) { if (p == this) { return 0; } - if (getSentenceNumber() < p.getSentenceNumber()) { return -1; } @@ -221,11 +230,6 @@ public class DefaultParse extends AbstractParse { System.out.println("Maybe incorrect measurement!"); - Stack<Parse> parents = new Stack<>(); - - - - // get parent and update distance // if match return distance // if not match do it again @@ -241,6 +245,7 @@ public class DefaultParse extends AbstractParse { } + @Override public opennlp.tools.coref.mention.Parse getPreviousToken() { Parse parent = parse.getParent(); Parse node = parse; @@ -267,6 +272,7 @@ public class DefaultParse extends AbstractParse { } } + @Override public opennlp.tools.coref.mention.Parse getNextToken() { Parse parent = parse.getParent(); Parse node = parse; diff --git a/opennlp-coref/src/main/java/opennlp/tools/formats/muc/Muc6FullParseCorefSampleStreamFactory.java b/opennlp-coref/src/main/java/opennlp/tools/formats/muc/Muc6FullParseCorefSampleStreamFactory.java index d715871..9f5a9d0 100644 --- a/opennlp-coref/src/main/java/opennlp/tools/formats/muc/Muc6FullParseCorefSampleStreamFactory.java +++ b/opennlp-coref/src/main/java/opennlp/tools/formats/muc/Muc6FullParseCorefSampleStreamFactory.java @@ -73,6 +73,7 @@ public class Muc6FullParseCorefSampleStreamFactory extends 
AbstractSampleStreamF super(Parameters.class); } + @Override public ObjectStream<CorefSample> create(String[] args) { Parameters params = ArgumentParser.parse(args, Parameters.class); @@ -85,7 +86,7 @@ public class Muc6FullParseCorefSampleStreamFactory extends AbstractSampleStreamF ObjectStream<String> mucDocStream = new FileToStringSampleStream( new DirectorySampleStream(params.getData(), new FileFilter() { - + @Override public boolean accept(File file) { return file.getName().toLowerCase().endsWith(".sgm"); } diff --git a/opennlp-coref/src/main/java/opennlp/tools/formats/muc/NameFinderCorefEnhancerStream.java b/opennlp-coref/src/main/java/opennlp/tools/formats/muc/NameFinderCorefEnhancerStream.java index e9e0bc4..4e24777 100644 --- a/opennlp-coref/src/main/java/opennlp/tools/formats/muc/NameFinderCorefEnhancerStream.java +++ b/opennlp-coref/src/main/java/opennlp/tools/formats/muc/NameFinderCorefEnhancerStream.java @@ -43,6 +43,7 @@ public class NameFinderCorefEnhancerStream extends FilterObjectStream<RawCorefSa this.tags = tags; } + @Override public RawCorefSample read() throws IOException { RawCorefSample sample = samples.read(); diff --git a/opennlp-similarity/src/main/java/opennlp/tools/jsmlearning/JSMLearnerOnLatticeWithAbduction.java b/opennlp-similarity/src/main/java/opennlp/tools/jsmlearning/JSMLearnerOnLatticeWithAbduction.java index 76ae8ed..cd7c818 100644 --- a/opennlp-similarity/src/main/java/opennlp/tools/jsmlearning/JSMLearnerOnLatticeWithAbduction.java +++ b/opennlp-similarity/src/main/java/opennlp/tools/jsmlearning/JSMLearnerOnLatticeWithAbduction.java @@ -20,25 +20,17 @@ package opennlp.tools.jsmlearning; import java.util.Arrays; import java.util.List; -public class JSMLearnerOnLatticeWithAbduction extends JSMLearnerOnLatticeWithDeduction{ - - - +public class JSMLearnerOnLatticeWithAbduction extends JSMLearnerOnLatticeWithDeduction { + @Override public JSMDecision buildLearningModel(List<String> posTexts, List<String> negTexts, String unknown, 
String[] separationKeywords){ - JSMDecision decision = super.buildLearningModel(posTexts, negTexts, unknown, separationKeywords); - // verify each hypothesis - //TODO - return decision; - + //TODO verify each hypothesis + return super.buildLearningModel(posTexts, negTexts, unknown, separationKeywords); } - - public static void main (String[] args) { - String[] posArr = new String[] {"I rent an office space. This office is for my business. I can deduct office rental expense from my business profit to calculate net income. ", "To run my business, I have to rent an office. The net business profit is calculated as follows. Rental expense needs to be subtracted from revenue. ", "To store goods for my retail business I rent some space. When I calculate the net income, I take revenue and subtract business expenses such as office rent. ", @@ -60,10 +52,5 @@ public class JSMLearnerOnLatticeWithAbduction extends JSMLearnerOnLatticeWithDed // Finally, do prediction JSMDecision dec = // may be determined by ... 
jsm.buildLearningModel(Arrays.asList(posArr), Arrays.asList(negArr), unknown , new String[]{"property"}); - - - - - } } diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/ContentGenerator.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/ContentGenerator.java index b71d0b2..00a6d33 100644 --- a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/ContentGenerator.java +++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/ContentGenerator.java @@ -93,8 +93,7 @@ public class ContentGenerator /*extends RelatedSentenceFinder*/ { for (HitBase item : searchResult) { // got some text from .html if (item.getAbstractText() != null && !(item.getUrl().indexOf(".pdf") > 0)) { // exclude pdf - opinionSentencesToAdd - .add(buildParagraphOfGeneratedText(item, sentence, null)); + opinionSentencesToAdd.add(buildParagraphOfGeneratedText(item, sentence, null)); } } } diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/ContentGeneratorSupport.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/ContentGeneratorSupport.java index 4389ab6..0575bbd 100644 --- a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/ContentGeneratorSupport.java +++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/ContentGeneratorSupport.java @@ -21,6 +21,7 @@ import java.util.ArrayList; import java.util.Comparator; import java.util.List; import java.util.logging.Logger; +import java.util.regex.Pattern; import opennlp.tools.similarity.apps.utils.StringDistanceMeasurer; import opennlp.tools.similarity.apps.utils.Utils; @@ -30,15 +31,16 @@ import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcess import org.apache.commons.lang.StringUtils; -/* - * This class supports content generation by static functions - * +/** + * This class supports content generation by static functions. 
*/ - public class ContentGeneratorSupport { private static final Logger LOG = Logger .getLogger("opennlp.tools.similarity.apps.ContentGeneratorSupport"); + //TODO - verify regexp!! + private static final Pattern SPACES_PATTERN = Pattern.compile("([a-z])(\\s{2,3})([A-Z])"); + /** * Takes a sentence and extracts noun phrases and entity names to from search * queries for finding relevant sentences on the web, which are then subject @@ -50,10 +52,7 @@ public class ContentGeneratorSupport { * @return List<String> of search expressions */ public static List<String> buildSearchEngineQueryFromSentence(String sentence) { - ParseTreeChunk matcher = new ParseTreeChunk(); - ParserChunker2MatcherProcessor pos = ParserChunker2MatcherProcessor - .getInstance(); - List<List<ParseTreeChunk>> sent1GrpLst = null; + ParserChunker2MatcherProcessor pos = ParserChunker2MatcherProcessor.getInstance(); List<ParseTreeChunk> nPhrases = pos .formGroupedPhrasesFromChunksForSentence(sentence).get(0); @@ -135,10 +134,11 @@ public class ContentGeneratorSupport { public static String cleanSpacesInCleanedHTMLpage(String pageContent){ //was 4 spaces //was 3 spaces => now back to 2 - //TODO - verify regexp!! - pageContent = pageContent.trim().replaceAll("([a-z])(\\s{2,3})([A-Z])", "$1. $3") - .replace("..", ".").replace(". . .", " "). - replace(". .",". ").trim(); // sometimes html breaks are converted into ' ' (two spaces), so + pageContent = pageContent.trim(); + pageContent = SPACES_PATTERN.matcher(pageContent).replaceAll("$1. $3") + .replace("..", ".").replace(". . .", " ") + .replace(". .",". ").trim(); + // sometimes html breaks are converted into ' ' (two spaces), so // we need to put '.' 
return pageContent; } @@ -209,12 +209,11 @@ public class ContentGeneratorSupport { for (Fragment f2 : fragmList2) { String sf1 = f1.getResultText(); String sf2 = f2.getResultText(); - if (StringUtils.isEmpty(sf1) || StringUtils.isEmpty(sf1)) + if (StringUtils.isEmpty(sf1) || StringUtils.isEmpty(sf2)) continue; if (meas.measureStringDistance(sf1, sf2) > dupeThresh) { fragmList2Results.remove(f2); - LOG.info("Removed duplicates from formed fragments list: " - + sf2); + LOG.info("Removed duplicates from formed fragments list: " + sf2); } } diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/RelatedSentenceFinder.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/RelatedSentenceFinder.java index 45bcbdb..80f02ed 100644 --- a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/RelatedSentenceFinder.java +++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/RelatedSentenceFinder.java @@ -311,8 +311,7 @@ public class RelatedSentenceFinder { * @param hits List<HitBase> of search results objects * @return List<String> of search results objects where dupes are removed */ - public static List<HitBase> removeDuplicatesFromResultantHits( - List<HitBase> hits) { + public static List<HitBase> removeDuplicatesFromResultantHits(List<HitBase> hits) { StringDistanceMeasurer meas = new StringDistanceMeasurer(); double dupeThresh = // 0.8; // if more similar, then considered dupes was 0.7; @@ -447,7 +446,7 @@ public class RelatedSentenceFinder { // or get original snippet pageSentence = fragment; if (pageSentence != null) - pageSentence.replace("_should_find_orig_", ""); + pageSentence = pageSentence.replace("_should_find_orig_", ""); // resultant sentence SHOULD NOT be longer than for times the size of // snippet fragment @@ -463,9 +462,7 @@ public class RelatedSentenceFinder { + " " + title, originalSentence); List<List<ParseTreeChunk>> match = matchRes.getMatchResult(); if (!matchRes.isVerbExists() || 
matchRes.isImperativeVerb()) { - System.out - .println("Rejected Sentence : No verb OR Yes imperative verb :" - + pageSentence); + System.out.println("Rejected Sentence : No verb OR Yes imperative verb :" + pageSentence); continue; } @@ -520,12 +517,9 @@ public class RelatedSentenceFinder { + "| with title= " + title); System.out.println("For fragment = " + fragment); } else - System.out - .println("Rejected sentence due to wrong area at webpage: " - + pageSentence); + System.out.println("Rejected sentence due to wrong area at webpage: " + pageSentence); } else - System.out.println("Rejected sentence due to low score: " - + pageSentence); + System.out.println("Rejected sentence due to low score: " + pageSentence); // } } catch (Throwable t) { t.printStackTrace(); @@ -902,63 +896,58 @@ public class RelatedSentenceFinder { t.printStackTrace(); } - return result; -} + return result; + } -public HitBase buildParagraphOfGeneratedText(HitBase item, - String originalSentence, List<String> sentsAll) { - List<Fragment> results = new ArrayList<>() ; - - Triple<List<String>, String, String[]> fragmentExtractionResults = formCandidateFragmentsForPage(item, originalSentence, sentsAll); + public HitBase buildParagraphOfGeneratedText(HitBase item, + String originalSentence, List<String> sentsAll) { + List<Fragment> results = new ArrayList<>() ; - List<String> allFragms = fragmentExtractionResults.getFirst(); + Triple<List<String>, String, String[]> fragmentExtractionResults = formCandidateFragmentsForPage(item, originalSentence, sentsAll); - for (String fragment : allFragms) { - String[] candidateSentences = formCandidateSentences(fragment, fragmentExtractionResults); - if (candidateSentences == null) - continue; - Fragment res = verifyCandidateSentencesAndFormParagraph(candidateSentences, item, fragment, originalSentence, sentsAll); - if (res!=null) - results.add(res); + List<String> allFragms = fragmentExtractionResults.getFirst(); + for (String fragment : allFragms) { + 
String[] candidateSentences = formCandidateSentences(fragment, fragmentExtractionResults); + if (candidateSentences == null) + continue; + Fragment res = verifyCandidateSentencesAndFormParagraph(candidateSentences, item, fragment, originalSentence, sentsAll); + if (res!=null) + results.add(res); + + } + item.setFragments(results); + return item; } - - item.setFragments(results ); - return item; -} - - - - -public static void main(String[] args) { - RelatedSentenceFinder f = new RelatedSentenceFinder(); - - List<HitBase> hits; - try { - // uncomment the sentence you would like to serve as a seed sentence for - // content generation for an event description - - // uncomment the sentence you would like to serve as a seed sentence for - // content generation for an event description - hits = f.generateContentAbout("Albert Einstein" - // "Britney Spears - The Femme Fatale Tour" - // "Rush Time Machine", - // "Blue Man Group" , - // "Belly Dance With Zaharah", - // "Hollander Musicology Lecture: Danielle Fosler-Lussier, Guest Lecturer", - // "Jazz Master and arguably the most famous jazz musician alive, trumpeter Wynton Marsalis", - ); - System.out.println(HitBase.toString(hits)); - System.out.println(HitBase.toResultantString(hits)); - // WordFileGenerator.createWordDoc("Essey about Albert Einstein", - // hits.get(0).getTitle(), hits); - - } catch (Exception e) { - e.printStackTrace(); - } -} + public static void main(String[] args) { + RelatedSentenceFinder f = new RelatedSentenceFinder(); + + List<HitBase> hits; + try { + // uncomment the sentence you would like to serve as a seed sentence for + // content generation for an event description + + // uncomment the sentence you would like to serve as a seed sentence for + // content generation for an event description + hits = f.generateContentAbout("Albert Einstein" + // "Britney Spears - The Femme Fatale Tour" + // "Rush Time Machine", + // "Blue Man Group" , + // "Belly Dance With Zaharah", + // "Hollander Musicology 
Lecture: Danielle Fosler-Lussier, Guest Lecturer", + // "Jazz Master and arguably the most famous jazz musician alive, trumpeter Wynton Marsalis", + ); + System.out.println(HitBase.toString(hits)); + System.out.println(HitBase.toResultantString(hits)); + // WordFileGenerator.createWordDoc("Essey about Albert Einstein", + // hits.get(0).getTitle(), hits); + + } catch (Exception e) { + e.printStackTrace(); + } + } } \ No newline at end of file diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/RelatedSentenceFinderML.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/RelatedSentenceFinderML.java index a075bc2..dbc93f5 100644 --- a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/RelatedSentenceFinderML.java +++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/RelatedSentenceFinderML.java @@ -20,7 +20,6 @@ package opennlp.tools.similarity.apps; import java.util.ArrayList; import java.util.Arrays; import java.util.List; -import java.util.logging.Logger; import opennlp.tools.similarity.apps.utils.Utils; import opennlp.tools.textsimilarity.TextProcessor; @@ -29,8 +28,6 @@ import opennlp.tools.textsimilarity.TextProcessor; * This class does content generation in ES, DE etc */ public class RelatedSentenceFinderML extends RelatedSentenceFinder{ - private static final Logger LOG = Logger.getLogger("opennlp.tools.similarity.apps.RelatedSentenceFinderML"); - public RelatedSentenceFinderML(int ms, int msr, float thresh, String key) { this.MAX_STEPS = ms; @@ -45,7 +42,6 @@ public class RelatedSentenceFinderML extends RelatedSentenceFinder{ public List<HitBase> generateContentAbout(String sentence) throws Exception { List<HitBase> opinionSentencesToAdd = new ArrayList<>(); System.out.println(" \n=== Entity to write about = " + sentence); - List<String> nounPhraseQueries = new ArrayList<>(); List<HitBase> searchResult = yrunner.runSearch(sentence, 100); if (MAX_SEARCH_RESULTS<searchResult.size()) diff --git 
a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/WordDocBuilderSingleImageSearchCall.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/WordDocBuilderSingleImageSearchCall.java index 79aa5d1..b0eaa29 100644 --- a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/WordDocBuilderSingleImageSearchCall.java +++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/WordDocBuilderSingleImageSearchCall.java @@ -16,7 +16,6 @@ */ package opennlp.tools.similarity.apps.solr; - import java.io.File; import java.util.ArrayList; import java.util.List; @@ -25,13 +24,15 @@ import net.billylieurance.azuresearch.AzureSearchImageResult; import net.billylieurance.azuresearch.AzureSearchResultSet; import org.docx4j.openpackaging.packages.WordprocessingMLPackage; +import org.docx4j.openpackaging.parts.WordprocessingML.MainDocumentPart; import opennlp.tools.similarity.apps.ContentGeneratorSupport; import opennlp.tools.similarity.apps.Fragment; import opennlp.tools.similarity.apps.HitBase; public class WordDocBuilderSingleImageSearchCall extends WordDocBuilder{ - + + @Override public String buildWordDoc(List<HitBase> content, String title){ String outputDocFilename = absPath+"/written/"+ title.replace(' ','_').replace('\"', ' ').trim()+ ".docx"; @@ -41,20 +42,20 @@ public class WordDocBuilderSingleImageSearchCall extends WordDocBuilder{ int count=0; try { wordMLPackage = WordprocessingMLPackage.createPackage(); - wordMLPackage.getMainDocumentPart().addStyledParagraphOfText("Title", title.toUpperCase()); + MainDocumentPart mdp = wordMLPackage.getMainDocumentPart(); + mdp.addStyledParagraphOfText("Title", title.toUpperCase()); for(HitBase para: content){ if (para.getFragments()==null || para.getFragments().size()<1) // no found content in this hit continue; try { if (!para.getTitle().endsWith("..") /*|| StringUtils.isAlphanumeric(para.getTitle())*/){ String sectTitle = 
ContentGeneratorSupport.getPortionOfTitleWithoutDelimiters(para.getTitle()); - wordMLPackage.getMainDocumentPart().addStyledParagraphOfText("Subtitle", - sectTitle); + mdp.addStyledParagraphOfText("Subtitle", sectTitle); } String paraText = para.getFragments().toString().replace("[", "").replace("]", "").replace(" | ", "") .replace(".,", ".").replace(".\"", "\"").replace(". .", ".") .replace(",.", "."); - wordMLPackage.getMainDocumentPart().addParagraphOfText(paraText); + mdp.addParagraphOfText(paraText); try { addImageByImageURLToPackage(count, wordMLPackage, imageURLs); @@ -67,33 +68,22 @@ public class WordDocBuilderSingleImageSearchCall extends WordDocBuilder{ count++; } // now add URLs - wordMLPackage.getMainDocumentPart().addStyledParagraphOfText("Subtitle", "REFERENCES"); + mdp.addStyledParagraphOfText("Subtitle", "REFERENCES"); for(HitBase para: content){ if (para.getFragments()==null || para.getFragments().size()<1) // no found content in this hit continue; try { - wordMLPackage.getMainDocumentPart().addStyledParagraphOfText("Subtitle", - para.getTitle()); + mdp.addStyledParagraphOfText("Subtitle", para.getTitle()); String paraText = para.getUrl(); - wordMLPackage.getMainDocumentPart().addParagraphOfText(paraText); - - + mdp.addParagraphOfText(paraText); + } catch (Exception e) { e.printStackTrace(); } } - wordMLPackage.save(new File(outputDocFilename)); System.out.println("Finished creating docx ="+outputDocFilename); - //TODO pdf export - /* - FOSettings foSettings = Docx4J.createFOSettings(); - foSettings.setWmlPackage(wordMLPackage); - OutputStream os = new java.io.FileOutputStream(outputDocFilename.replace(".docx", ".pdf")); - Docx4J.toFO(foSettings, os, Docx4J.FLAG_NONE); - System.out.println("Finished creating docx's PDF ="+outputDocFilename); - */ } catch (Exception e) { e.printStackTrace(); @@ -102,9 +92,8 @@ public class WordDocBuilderSingleImageSearchCall extends WordDocBuilder{ return outputDocFilename; } - protected void 
addImageByImageURLToPackage(int count, - WordprocessingMLPackage wordMLPackage, - List<String> imageURLs) { + protected void addImageByImageURLToPackage(int count, WordprocessingMLPackage wordMLPackage, + List<String> imageURLs) { if (count>imageURLs.size()-1) return; @@ -112,7 +101,7 @@ public class WordDocBuilderSingleImageSearchCall extends WordDocBuilder{ String destinationFile = url.replace("http://", "").replace("/", "_"); saveImageFromTheWeb(url, absPath+IMG_REL_PATH+destinationFile); File file = new File(absPath+IMG_REL_PATH+destinationFile); - try { + try { byte[] bytes = convertImageToByteArray(file); addImageToPackage(wordMLPackage, bytes); } catch (Exception e) { @@ -130,20 +119,19 @@ public class WordDocBuilderSingleImageSearchCall extends WordDocBuilder{ return imageURLs; } - - public static void main(String[] args){ - WordDocBuilderSingleImageSearchCall b = new WordDocBuilderSingleImageSearchCall(); - List<HitBase> content = new ArrayList<>(); - for(int i = 0; i<10; i++){ - HitBase h = new HitBase(); - h.setTitle("albert einstein "+i); - List<Fragment> frs = new ArrayList<>(); - frs.add(new Fragment(" content "+i, 0)); - h.setFragments(frs); - content.add(h); - } - - b.buildWordDoc(content, "albert einstein"); - } + public static void main(String[] args){ + WordDocBuilderSingleImageSearchCall b = new WordDocBuilderSingleImageSearchCall(); + List<HitBase> content = new ArrayList<>(); + for(int i = 0; i<10; i++){ + HitBase h = new HitBase(); + h.setTitle("albert einstein "+i); + List<Fragment> frs = new ArrayList<>(); + frs.add(new Fragment(" content "+i, 0)); + h.setFragments(frs); + content.add(h); + } + + b.buildWordDoc(content, "albert einstein"); + } } diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/TaxoQuerySnapshotMatcher.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/TaxoQuerySnapshotMatcher.java index 1be923e..fa205d7 100644 --- 
a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/TaxoQuerySnapshotMatcher.java +++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/TaxoQuerySnapshotMatcher.java @@ -19,7 +19,6 @@ package opennlp.tools.similarity.apps.taxo_builder; import java.util.ArrayList; import java.util.Arrays; -import java.util.HashMap; import java.util.List; import java.util.Map; @@ -55,8 +54,7 @@ public class TaxoQuerySnapshotMatcher { */ public int getTaxoScore(String query, String snapshot) { - lemma_ExtendedAssocWords = (HashMap<String, List<List<String>>>) taxo - .getLemma_ExtendedAssocWords(); + lemma_ExtendedAssocWords = taxo.getLemma_ExtendedAssocWords(); query = query.toLowerCase(); snapshot = snapshot.toLowerCase(); diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/TaxonomyExtenderViaMebMining.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/TaxonomyExtenderViaMebMining.java index 2f53a7d..e780330 100644 --- a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/TaxonomyExtenderViaMebMining.java +++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/TaxonomyExtenderViaMebMining.java @@ -27,7 +27,6 @@ import opennlp.tools.similarity.apps.HitBase; import opennlp.tools.similarity.apps.utils.StringCleaner; import opennlp.tools.stemmer.PStemmer; import opennlp.tools.textsimilarity.ParseTreeChunk; -import opennlp.tools.textsimilarity.ParseTreeChunkListScorer; import opennlp.tools.textsimilarity.SentencePairMatchResult; import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor; @@ -39,11 +38,9 @@ import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcess * derived list output map 2) for such manual list of words -> derived list of * words */ - public class TaxonomyExtenderViaMebMining extends BingQueryRunner { - private final ParseTreeChunkListScorer 
parseTreeChunkListScorer = new ParseTreeChunkListScorer(); - ParserChunker2MatcherProcessor sm; + private ParserChunker2MatcherProcessor sm; private Map<String, List<List<String>>> lemma_ExtendedAssocWords = new HashMap<>(); private final Map<List<String>, List<List<String>>> assocWords_ExtendedAssocWords = new HashMap<>(); @@ -145,19 +142,17 @@ public class TaxonomyExtenderViaMebMining extends BingQueryRunner { List<HitBase> resultList = runSearch(query, numbOfHits); for (int i = 0; i < resultList.size(); i++) { - { - for (int j = i + 1; j < resultList.size(); j++) { - HitBase h1 = resultList.get(i); - HitBase h2 = resultList.get(j); - String snapshot1 = StringCleaner.processSnapshotForMatching(h1 - .getTitle() + " . " + h1.getAbstractText()); - String snapshot2 = StringCleaner.processSnapshotForMatching(h2 - .getTitle() + " . " + h2.getAbstractText()); - SentencePairMatchResult matchRes = sm.assessRelevance(snapshot1, - snapshot2); - List<List<ParseTreeChunk>> matchResult = matchRes.getMatchResult(); - genResult.addAll(matchResult); - } + for (int j = i + 1; j < resultList.size(); j++) { + HitBase h1 = resultList.get(i); + HitBase h2 = resultList.get(j); + String snapshot1 = StringCleaner.processSnapshotForMatching(h1 + .getTitle() + " . " + h1.getAbstractText()); + String snapshot2 = StringCleaner.processSnapshotForMatching(h2 + .getTitle() + " . 
" + h2.getAbstractText()); + SentencePairMatchResult matchRes = sm.assessRelevance(snapshot1, + snapshot2); + List<List<ParseTreeChunk>> matchResult = matchRes.getMatchResult(); + genResult.addAll(matchResult); } } @@ -175,9 +170,7 @@ public class TaxonomyExtenderViaMebMining extends BingQueryRunner { public static void main(String[] args) { TaxonomyExtenderViaMebMining self = new TaxonomyExtenderViaMebMining(); - self.extendTaxonomy("src/test/resources/taxonomies/irs_dom.ari", "tax", - "en"); - + self.extendTaxonomy("src/test/resources/taxonomies/irs_dom.ari", "tax", "en"); } } diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/FileHandler.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/FileHandler.java deleted file mode 100644 index 21bdafb..0000000 --- a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/FileHandler.java +++ /dev/null @@ -1,373 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package opennlp.tools.similarity.apps.utils; - -import java.io.BufferedReader; -import java.io.BufferedWriter; -import java.io.ByteArrayOutputStream; -import java.io.EOFException; -import java.io.File; -import java.io.FileInputStream; -import java.io.FileOutputStream; -import java.io.FileReader; -import java.io.FileWriter; -import java.io.IOException; -import java.io.ObjectInputStream; -import java.io.ObjectOutputStream; -import java.io.PrintWriter; -import java.util.ArrayList; -import java.util.Iterator; -import java.util.List; -import java.util.logging.Logger; - -/** - * This class responsible to save data to files as well as read out! It is - * capable to handle text and binary files. - */ -public class FileHandler { - - private static final Logger LOG = Logger - .getLogger("opennlp.tools.similarity.apps.utils.FileHandler"); - - public void writeToTextFile(String data, String filepath, boolean append) - throws IOException { - try { - BufferedWriter out = new BufferedWriter(new FileWriter(filepath, append)); - out.write(data + "\n"); - out.close(); - } catch (IOException e) { - LOG.severe(e.toString()); - e.printStackTrace(); - } - } - - /** - * Writes data from an arrayList<String> to a text-file where each line of the - * text represented by an element in the list. 
- * - * @param list - * @param filePath - * @param append - * @throws Exception - */ - public void writeToTextFile(ArrayList<String> list, String filePath, boolean append) throws Exception { - FileWriter outFile; - Iterator<String> it = list.iterator(); - if (!append) { - outFile = new FileWriter(filePath); - PrintWriter out = new PrintWriter(outFile); - while (it.hasNext()) { - out.println(it.next()); - } - outFile.close(); - } else { - int tmp = 0; - while (it.hasNext()) { - if (tmp == 0) { - appendtofile("\n" + it.next(), filePath); - } else { - appendtofile(it.next(), filePath); - } - tmp++; - } - } - } - - public void writeObjectToFile(Object obj, String filepath, boolean append) { - if (!isFileOrDirectoryExists(getDirPathfromFullPath(filepath))) { - createFolder(getDirPathfromFullPath(filepath)); - } - ObjectOutputStream outputStream; - try { - outputStream = new ObjectOutputStream(new FileOutputStream(filepath)); - outputStream.writeObject(obj); - } catch (IOException e) { - LOG.severe(e.toString()); - } - } - - public Object readObjectfromFile(String filePath) { - ObjectInputStream inputStream = null; - try { - // Construct the ObjectInputStream object - inputStream = new ObjectInputStream(new FileInputStream(filePath)); - Object obj; - while ((obj = inputStream.readObject()) != null) { - return obj; - } - } catch (EOFException ex) { // This exception will be caught when EOF is - // reached - LOG.severe("End of file reached.\n" + ex); - } catch (ClassNotFoundException | IOException ex) { - LOG.severe(ex.toString()); - } finally { - // Close the ObjectInputStream - try { - if (inputStream != null) { - inputStream.close(); - } - } catch (IOException ex) { - LOG.severe(ex.toString()); - } - } - return null; - } - - /** - * Creates a byte array from any object. - * <p> - * I wanted to use it when I write out object to files! 
(This is not in use - * right now, I may move it into other class) - * - * @param obj - * @return - * @throws java.io.IOException - */ - public byte[] getBytes(Object obj) throws java.io.IOException { - ByteArrayOutputStream bos = new ByteArrayOutputStream(); - ObjectOutputStream oos = new ObjectOutputStream(bos); - oos.writeObject(obj); - oos.flush(); - oos.close(); - bos.close(); - return bos.toByteArray(); - } - - /** - * Fetches all content from a text file, and return it as a String. - * - * @return - */ - public String readFromTextFile(String filePath) { - StringBuilder contents = new StringBuilder(); - // ...checks on aFile are edited - File aFile = new File(filePath); - - try { - // use buffering, reading one line at a time - // FileReader always assumes default encoding is OK! - // TODO be sure that the default encoding is OK!!!!! Otherwise - // change it - - try (BufferedReader input = new BufferedReader(new FileReader(aFile))) { - String line; // not declared within while loop - /* - * readLine is a bit quirky : it returns the content of a line MINUS the - * newline. it returns null only for the END of the stream. it returns - * an empty String if two newlines appear in a row. - */ - while ((line = input.readLine()) != null) { - contents.append(line); - contents.append(System.getProperty("line.separator")); - } - } - } catch (IOException ex) { - LOG.severe("fileName: " + filePath +"\n " + ex); - } - return contents.toString(); - } - - /** - * Reads text file line-wise each line will be an element in the resulting - * list - * - * @param filePath - * @return - */ - public List<String> readLinesFromTextFile(String filePath) { - List<String> lines = new ArrayList<>(); - // ...checks on aFile are edited - File aFile = new File(filePath); - try { - // use buffering, reading one line at a time - // FileReader always assumes default encoding is OK! - // TODO be sure that the default encoding is OK!!!!! 
Otherwise - // change it - - BufferedReader input = new BufferedReader(new FileReader(aFile)); - try { - String line; // not declared within while loop - /* - * readLine is a bit quirky : it returns the content of a line MINUS the - * newline. it returns null only for the END of the stream. it returns - * an empty String if two newlines appear in a row. - */ - while ((line = input.readLine()) != null) { - lines.add(line); - } - } finally { - input.close(); - } - } catch (IOException ex) { - LOG.severe(ex.toString()); - } - return lines; - } - - private void appendtofile(String data, String filePath) { - try (BufferedWriter out = new BufferedWriter(new FileWriter(filePath, true))) { - out.write(data + "\n"); - } catch (IOException e) { - } - } - - public void createFolder(String path) { - if (!isFileOrDirectoryExists(path)) { - File file = new File(path); - try { - file.mkdirs(); - } catch (Exception e) { - LOG.severe("Directory already exists or the file-system is read only"); - } - } - } - - public boolean isFileOrDirectoryExists(String path) { - File file = new File(path); - return file.exists(); - } - - /** - * Separates the directory-path from a full file-path - * - * @param filePath - * @return - */ - private String getDirPathfromFullPath(String filePath) { - String dirPath = ""; - if (filePath != null) { - if (filePath.contains("\\")) - dirPath = filePath.substring(0, filePath.lastIndexOf("\\")); - } - return dirPath; - } - - /** - * Returns the file-names of the files in a folder (not paths only names) (Not - * recursive) - * - * @param dirPath - * @return - */ - public ArrayList<String> getFileNamesInFolder(String dirPath) { - ArrayList<String> fileNames = new ArrayList<>(); - - File folder = new File(dirPath); - File[] listOfFiles = folder.listFiles(); - - for (File listOfFile : listOfFiles) { - if (listOfFile.isFile()) { - fileNames.add(listOfFile.getName()); - } else if (listOfFile.isDirectory()) { - // TODO if I want to use it recursive I should handle 
this case - } - } - return fileNames; - } - - public void deleteAllfilesinDir(String dirName) { - ArrayList<String> fileNameList = getFileNamesInFolder(dirName); - if (fileNameList != null) { - for (String s : fileNameList) { - try { - deleteFile(dirName + s); - } catch (IllegalArgumentException e) { - LOG.severe("No way to delete file: " + dirName + s + "\n" + - e); - } - } - } - } - - public void deleteFile(String filePath) throws IllegalArgumentException { - // A File object to represent the filename - File f = new File(filePath); - // Make sure the file or directory exists and isn't write protected - if (!f.exists()) - throw new IllegalArgumentException("Delete: no such file or directory: " - + filePath); - - if (!f.canWrite()) - throw new IllegalArgumentException("Delete: write protected: " + filePath); - // If it is a directory, make sure it is empty - if (f.isDirectory()) { - String[] files = f.list(); - if (files.length > 0) - throw new IllegalArgumentException("Delete: directory not empty: " - + filePath); - } - // Attempt to delete it - boolean success = f.delete(); - if (!success) - throw new IllegalArgumentException("Delete: deletion failed"); - } - - public boolean deleteDirectory(File path) { - if (path.exists()) { - File[] files = path.listFiles(); - for (File file : files) { - if (file.isDirectory()) { - deleteDirectory(file); - } else { - file.delete(); - } - } - } - return (path.delete()); - } - - /** - * Returns the absolute-file-paths of the files in a directory (not recursive) - * - * @param dirPath - * @return - */ - public ArrayList<String> getFilePathsInFolder(String dirPath) { - ArrayList<String> filePaths = new ArrayList<>(); - - File folder = new File(dirPath); - File[] listOfFiles = folder.listFiles(); - if (listOfFiles == null) - return null; - for (File listOfFile : listOfFiles) { - if (listOfFile.isFile()) { - filePaths.add(listOfFile.getAbsolutePath()); - } else if (listOfFile.isDirectory()) { - // TODO if I want to use it recursive 
I should handle this case - } - } - return filePaths; - } - - /** - * Returns the number of individual files in a directory (Not recursive) - * - * @param dirPath - * @return - */ - public int getFileNumInFolder(String dirPath) { - int num; - try { - num = getFileNamesInFolder(dirPath).size(); - } catch (Exception e) { - num = 0; - } - return num; - } - -} diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/Utils.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/Utils.java index bae6357..4fd8a17 100644 --- a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/Utils.java +++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/Utils.java @@ -17,29 +17,19 @@ package opennlp.tools.similarity.apps.utils; -import java.awt.Graphics2D; -import java.awt.geom.AffineTransform; -import java.awt.image.BufferedImage; -import java.io.File; import java.util.ArrayList; import java.util.Comparator; import java.util.HashMap; import java.util.HashSet; import java.util.Hashtable; import java.util.List; -import java.util.logging.Logger; import java.util.regex.Matcher; import java.util.regex.Pattern; -import javax.imageio.ImageIO; - import org.apache.commons.lang.StringUtils; public class Utils { - private static final Logger LOG = Logger - .getLogger("opennlp.tools.similarity.apps.utils.Utils"); - protected static final ArrayList<String[]> CHARACTER_MAPPINGS = new ArrayList<>(); static { @@ -237,30 +227,6 @@ public class Utils { } } - public static boolean createResizedCopy(String originalImage, - String newImage, int scaledWidth, int scaledHeight) { - boolean retVal = true; - try { - File o = new File(originalImage); - BufferedImage bsrc = ImageIO.read(o); - BufferedImage bdest = new BufferedImage(scaledWidth, scaledHeight, - BufferedImage.TYPE_INT_RGB); - - Graphics2D g = bdest.createGraphics(); - AffineTransform at = AffineTransform.getScaleInstance( - (double) scaledWidth / bsrc.getWidth(), - 
(double) scaledHeight / bsrc.getHeight()); - g.drawRenderedImage(bsrc, at); - ImageIO.write(bdest, "jpeg", new File(newImage)); - - } catch (Exception e) { - retVal = false; - LOG.severe("Failed creating thumbnail for image: " + originalImage + e); - } - - return retVal; - } - private static int minimum(int a, int b, int c) { int mi; @@ -676,7 +642,7 @@ public class Utils { public static boolean isLatinWord(String word) { for (int i = 0; i < word.length(); i++) { - int asciiCode = (int) word.charAt(i); + int asciiCode = word.charAt(i); if (asciiCode > 128) return false; } diff --git a/opennlp-wsd/src/main/java/opennlp/tools/cmdline/disambiguator/DisambiguatorTool.java b/opennlp-wsd/src/main/java/opennlp/tools/cmdline/disambiguator/DisambiguatorTool.java index 98f32bd..0a4554f 100644 --- a/opennlp-wsd/src/main/java/opennlp/tools/cmdline/disambiguator/DisambiguatorTool.java +++ b/opennlp-wsd/src/main/java/opennlp/tools/cmdline/disambiguator/DisambiguatorTool.java @@ -43,13 +43,11 @@ import opennlp.tools.util.ObjectStream; import opennlp.tools.util.ParagraphStream; import opennlp.tools.util.PlainTextByLineStream; -/* - * Command line tool for disambiguator supports MFS for now - * +/** + * Command line tool for disambiguator supports MFS for now. 
*/ public class DisambiguatorTool extends CmdLineTool { - // TODO CmdLineTool should be an interface not abstract class @Override public String getName() { return "Disambiguator"; diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/IMSWSDContextGenerator.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/IMSWSDContextGenerator.java index 7cc7015..11d8f9e 100644 --- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/IMSWSDContextGenerator.java +++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/IMSWSDContextGenerator.java @@ -21,6 +21,7 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.HashSet; import java.util.List; +import java.util.regex.Pattern; public class IMSWSDContextGenerator implements WSDContextGenerator { @@ -48,13 +49,14 @@ public class IMSWSDContextGenerator implements WSDContextGenerator { // TODO consider the windowSize List<String> contextWords = new ArrayList<>(); + final Pattern pattern = Pattern.compile("[^a-z_]"); + for (int i = 0; i < toks.length; i++) { if (lemmas != null) { - if (!WSDHelper.STOP_WORDS.contains(toks[i].toLowerCase()) && (index - != i)) { + if (!WSDHelper.STOP_WORDS.contains(toks[i].toLowerCase()) && (index != i)) { - String lemma = lemmas[i].toLowerCase().replaceAll("[^a-z_]", "") - .trim(); + String lemma = lemmas[i].toLowerCase(); + lemma = pattern.matcher(lemma).replaceAll("").trim(); if (lemma.length() > 1) { contextWords.add(lemma); diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/OSCCWSDContextGenerator.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/OSCCWSDContextGenerator.java index 8c52c9d..6e13523 100644 --- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/OSCCWSDContextGenerator.java +++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/OSCCWSDContextGenerator.java @@ -24,6 +24,7 @@ import java.util.Arrays; import java.util.HashSet; import java.util.List; import java.util.Set; +import java.util.regex.Pattern; 
import net.sf.extjwnl.data.Synset; @@ -38,14 +39,16 @@ public class OSCCWSDContextGenerator implements WSDContextGenerator { // TODO consider windowSize ArrayList<String> contextClusters = new ArrayList<>(); + final Pattern pattern = Pattern.compile("[^a-z_]"); + for (int i = 0; i < toks.length; i++) { if (lemmas != null) { if (!WSDHelper.STOP_WORDS.contains(toks[i].toLowerCase()) && (index != i)) { - String lemma = lemmas[i].toLowerCase().replaceAll("[^a-z_]", "") - .trim(); + String lemma = lemmas[i].toLowerCase(); + lemma = pattern.matcher(lemma).replaceAll("").trim(); WordPOS word = new WordPOS(lemma, tags[i]); diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDDefaultParameters.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDDefaultParameters.java index e65bccb..446b46c 100644 --- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDDefaultParameters.java +++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDDefaultParameters.java @@ -98,8 +98,8 @@ public class WSDDefaultParameters extends WSDParameters { this.trainingDataDirectory = trainingDataDirectory; } - @Override public boolean areValid() { - // TODO recheck this pattern + @Override + public boolean areValid() { return true; } diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDModel.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDModel.java index a51b656..90afbbf 100644 --- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDModel.java +++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDModel.java @@ -126,7 +126,8 @@ public class WSDModel extends BaseModel { return true; } - @Override protected void validateArtifactMap() throws InvalidFormatException { + @Override + protected void validateArtifactMap() throws InvalidFormatException { super.validateArtifactMap(); if (!(artifactMap.get(WSD_MODEL_ENTRY_NAME) instanceof AbstractModel)) { diff --git 
a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDSampleStream.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDSampleStream.java index d8667d2..fc060f3 100644 --- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDSampleStream.java +++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDSampleStream.java @@ -61,8 +61,7 @@ public class WSDSampleStream extends FilterObjectStream<String, WSDSample> { } catch (InvalidFormatException e) { if (LOG.isLoggable(Level.WARNING)) { - LOG - .warning("Error during parsing, ignoring sentence: " + sentence); + LOG.warning("Error during parsing, ignoring sentence: " + sentence); } sample = null; // new WSDSample(new String[]{}, new String[]{},0); diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDisambiguatorFactory.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDisambiguatorFactory.java index b222f52..f75d9b7 100644 --- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDisambiguatorFactory.java +++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDisambiguatorFactory.java @@ -50,7 +50,8 @@ public class WSDisambiguatorFactory extends BaseToolFactory { } } - @Override public void validateArtifactMap() throws InvalidFormatException { + @Override + public void validateArtifactMap() throws InvalidFormatException { // no additional artifacts } diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDisambiguatorME.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDisambiguatorME.java index c8aa549..b70bd42 100644 --- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDisambiguatorME.java +++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDisambiguatorME.java @@ -115,7 +115,8 @@ public class WSDisambiguatorME extends WSDisambiguator { return surroundingWordsModel; } - @Override public String disambiguate(WSDSample sample) { + @Override + public String disambiguate(WSDSample sample) { if 
(WSDHelper.isRelevantPOSTag(sample.getTargetTag())) { String wordTag = sample.getTargetWordTag(); @@ -202,6 +203,7 @@ public class WSDisambiguatorME extends WSDisambiguator { * @param index : the index of the word to disambiguate * @return an array of the senses of the word to disambiguate */ + @Override public String disambiguate(String[] tokenizedContext, String[] tokenTags, String[] lemmas, int index) { return disambiguate( diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WordPOS.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WordPOS.java index 7ada773..5a2ff78 100644 --- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WordPOS.java +++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WordPOS.java @@ -36,7 +36,6 @@ public class WordPOS { private List<String> stems; private final POS pos; private String posTag; - public boolean isTarget = false; public WordPOS(String word, String tag) throws IllegalArgumentException { if (word == null || tag == null) { @@ -82,8 +81,7 @@ public class WordPOS { try { indexWord = WSDHelper.getDictionary().lookupIndexWord(pos, word); if (indexWord == null) { - WSDHelper - .print("NULL synset probably a POS tagger mistake ! :: [POS] : " + WSDHelper.print("NULL synset probably a POS tagger mistake ! 
:: [POS] : " + pos.getLabel() + " [word] : " + word); return null; } diff --git a/tf-ner-poc/src/main/java/org/apache/opennlp/ModelUtil.java b/tf-ner-poc/src/main/java/org/apache/opennlp/ModelUtil.java index 76e5c8a..1f5b2d2 100644 --- a/tf-ner-poc/src/main/java/org/apache/opennlp/ModelUtil.java +++ b/tf-ner-poc/src/main/java/org/apache/opennlp/ModelUtil.java @@ -29,21 +29,21 @@ public class ModelUtil { public static Path writeModelToTmpDir(InputStream modelIn) throws IOException { Path tmpDir = Files.createTempDirectory("opennlp2"); - ZipInputStream zis = new ZipInputStream(modelIn); - ZipEntry zipEntry = zis.getNextEntry(); - while(zipEntry != null){ - Path newFile = tmpDir.resolve(zipEntry.getName()); + try (ZipInputStream zis = new ZipInputStream(modelIn)) { + ZipEntry zipEntry = zis.getNextEntry(); + while(zipEntry != null){ + Path newFile = tmpDir.resolve(zipEntry.getName()); - Files.createDirectories(newFile.getParent()); - Files.copy(zis, newFile); + Files.createDirectories(newFile.getParent()); + Files.copy(zis, newFile); - // TODO: How to delete the tmp directory after we are done loading from it ?! - newFile.toFile().deleteOnExit(); + // TODO: How to delete the tmp directory after we are done loading from it ?! 
+ newFile.toFile().deleteOnExit(); - zipEntry = zis.getNextEntry(); + zipEntry = zis.getNextEntry(); + } + zis.closeEntry(); } - zis.closeEntry(); - zis.close(); return tmpDir; } diff --git a/tf-ner-poc/src/test/java/org/apache/opennlp/namefinder/PredictTest.java b/tf-ner-poc/src/test/java/org/apache/opennlp/namefinder/PredictTest.java index 4c7b906..0bdae56 100644 --- a/tf-ner-poc/src/test/java/org/apache/opennlp/namefinder/PredictTest.java +++ b/tf-ner-poc/src/test/java/org/apache/opennlp/namefinder/PredictTest.java @@ -19,20 +19,20 @@ package org.apache.opennlp.namefinder; -import org.junit.jupiter.api.Disabled; +import java.io.IOException; +import java.nio.file.Path; + import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.condition.DisabledIfSystemProperty; import opennlp.tools.util.Span; -import java.io.IOException; -import java.nio.file.Path; - class PredictTest { - @Test - @Disabled // TODO This test is not platform neutral and, for instance, fails with - // "Cannot find TensorFlow native library for OS: darwin, architecture: aarch64" - // We need JUnit 5 in the sandbox to circumvent this, so it can be run in supported environments + // Note: Atm, this test is not platform neutral and, for instance, fails with + // "Cannot find TensorFlow native library for OS: darwin, architecture: aarch64" + // That's why it is disabled via the architecture system property. + @Test @DisabledIfSystemProperty(named = "os.arch", matches = "aarch64") void testFindTokens() throws IOException { // can be changed to File or InputStream
