OPENNLP-622 Fixed issues related to command line.
Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/d1fab8cd Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/d1fab8cd Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/d1fab8cd Branch: refs/heads/trunk Commit: d1fab8cd4215ddf65ce98ef6aae2bc06720be742 Parents: f588858 Author: William Colen <[email protected]> Authored: Fri Jul 8 19:18:54 2016 +0000 Committer: William Colen <[email protected]> Committed: Fri Jul 8 19:18:54 2016 +0000 ---------------------------------------------------------------------- .../builder/XMLDictionaryToTableParams.java | 11 ++++- .../builder/XMLDictionaryToTableTool.java | 51 ++++++++++++++++++-- .../tagdict/MorfologikPOSTaggerFactory.java | 26 ---------- .../tagdict/POSTaggerFactoryTest.java | 6 ++- 4 files changed, 63 insertions(+), 31 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/opennlp/blob/d1fab8cd/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableParams.java ---------------------------------------------------------------------- diff --git a/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableParams.java b/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableParams.java index b88cc5d..4ee8cd4 100644 --- a/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableParams.java +++ b/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableParams.java @@ -19,6 +19,7 @@ package opennlp.morfologik.cmdline.builder; import java.io.File; +import opennlp.tools.cmdline.ArgumentParser.OptionalParameter; import opennlp.tools.cmdline.ArgumentParser.ParameterDescription; import opennlp.tools.cmdline.params.EncodingParameter; @@ -30,7 +31,15 @@ interface XMLDictionaryToTableParams extends EncodingParameter { @ParameterDescription(valueName = "in", description = "OpenNLP XML Tag Dictionary.") File getInputFile(); - @ParameterDescription(valueName = "out", description = "Tab separated format.") + @ParameterDescription(valueName = "out", description = "Output for Morfologik (.info will be also created).") File getOutputFile(); + @ParameterDescription(valueName = "char", description = "Columm separator (must be a single character)") + @OptionalParameter(defaultValue=",") + String getSeparator(); + + @ParameterDescription(valueName = "value", description = " Type of lemma-inflected form encoding compression that precedes automaton construction. Allowed values: [suffix, infix, prefix, none].") + @OptionalParameter(defaultValue="prefix") + String getEncoder(); + } http://git-wip-us.apache.org/repos/asf/opennlp/blob/d1fab8cd/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableTool.java ---------------------------------------------------------------------- diff --git a/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableTool.java b/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableTool.java index c87f016..0e7f2d5 100644 --- a/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableTool.java +++ b/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableTool.java @@ -23,8 +23,11 @@ import java.io.FileInputStream; import java.io.IOException; import java.nio.charset.Charset; import java.nio.file.Files; +import java.nio.file.Path; import java.util.Iterator; +import java.util.Properties; +import morfologik.stemming.DictionaryMetadata; import opennlp.tools.cmdline.BasicCmdLineTool; import opennlp.tools.cmdline.CmdLineUtil; import opennlp.tools.cmdline.TerminateToolException; @@ -35,6 +38,8 @@ public class XMLDictionaryToTableTool extends BasicCmdLineTool { interface Params extends XMLDictionaryToTableParams { } + private String SEPARATOR; + public String getShortDescription() { return "reads an OpenNLP XML tag dictionary and outputs it in a tab separated file"; } @@ -49,6 +54,7 @@ public class XMLDictionaryToTableTool extends BasicCmdLineTool { File dictInFile = params.getInputFile(); File dictOutFile = params.getOutputFile(); Charset encoding = params.getEncoding(); + SEPARATOR = params.getSeparator(); CmdLineUtil.checkInputFile("dictionary input file", dictInFile); CmdLineUtil.checkOutputFile("dictionary output file", dictOutFile); @@ -66,17 +72,56 @@ public class XMLDictionaryToTableTool extends BasicCmdLineTool { encoding)) { while (iterator.hasNext()) { String word = iterator.next(); - String wordAndLemma = word + "\t\t"; // lemma is empty for (String tag : tagDictionary.getTags(word)) { - writer.write(wordAndLemma + tag); - writer.newLine(); + if(valid(word,tag)) { + String entry = createEntry(word, tag); + writer.write(entry); + writer.newLine(); + } } } writer.close(); + System.out.println("Created dictionary: " + dictOutFile.toPath()); } catch (IOException e) { throw new TerminateToolException(-1, "Error while writing output: " + e.getMessage(), e); } + + Properties info = new Properties(); + info.setProperty("fsa.dict.separator", SEPARATOR); + info.setProperty("fsa.dict.encoding", params.getEncoding().name()); + info.setProperty("fsa.dict.encoder", params.getEncoder()); + + Path metaPath = DictionaryMetadata.getExpectedMetadataLocation(dictOutFile.toPath()); + + try { + info.store(Files.newOutputStream(metaPath), "Info file for FSA Morfologik dictionary."); + } catch (IOException e) { + throw new TerminateToolException(-1, "Error while writing metadata output: " + + e.getMessage(), e); + } + System.out.println("Created metadata: " + dictOutFile.toPath()); + + } + + private boolean valid(String word, String tag) { + if(word.contains(SEPARATOR) || tag.contains(SEPARATOR)) { + System.out + .println("Warn: invalid entry because contains separator - word: " + + word + " tag: " + tag); + return false; + } + + return true; + } + + private String createEntry(String word, String tag) { + + String entry = "" + SEPARATOR +// base + word + SEPARATOR + + tag; + + return entry; } } http://git-wip-us.apache.org/repos/asf/opennlp/blob/d1fab8cd/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java ---------------------------------------------------------------------- diff --git a/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java b/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java index dcb6554..93d6c61 100644 --- a/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java +++ b/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java @@ -17,8 +17,6 @@ package opennlp.morfologik.tagdict; -import static opennlp.morfologik.util.MorfologikUtil.getExpectedPropertiesFile; - import java.io.ByteArrayInputStream; import java.io.File; import java.io.FileNotFoundException; @@ -27,7 +25,6 @@ import java.io.InputStream; import java.io.OutputStream; import java.nio.file.Files; import java.nio.file.Path; -import java.nio.file.Paths; import java.util.Map; import morfologik.stemming.DictionaryMetadata; @@ -81,29 +78,6 @@ public class MorfologikPOSTaggerFactory extends POSTaggerFactory { protected void init(Dictionary ngramDictionary, TagDictionary posDictionary) { super.init(ngramDictionary, null); this.dict = posDictionary; - - // get the dictionary path - String path = System.getProperty("morfologik.dict"); - if (path == null) { - throw new IllegalArgumentException( - "The property fsa.dict is missing! -Dmorfologik.dict=path"); - } - - // now we try to load it... - try { - this.dictData = Files.readAllBytes(Paths.get(path)); - this.dictInfo = Files.readAllBytes(getExpectedPropertiesFile(path) - .toPath()); - - this.dict = createMorfologikDictionary(dictData, dictInfo); - - } catch (IllegalArgumentException e) { - throw new IllegalArgumentException( - "The file is not a Morfologik dictionary!", e); - } catch (IOException e) { - throw new IllegalArgumentException( - "Could not open the Morfologik dictionary or the .info file", e); - } } @Override http://git-wip-us.apache.org/repos/asf/opennlp/blob/d1fab8cd/src/test/java/opennlp/morfologik/tagdict/POSTaggerFactoryTest.java ---------------------------------------------------------------------- diff --git a/src/test/java/opennlp/morfologik/tagdict/POSTaggerFactoryTest.java b/src/test/java/opennlp/morfologik/tagdict/POSTaggerFactoryTest.java index 9233979..7341a02 100644 --- a/src/test/java/opennlp/morfologik/tagdict/POSTaggerFactoryTest.java +++ b/src/test/java/opennlp/morfologik/tagdict/POSTaggerFactoryTest.java @@ -17,7 +17,7 @@ package opennlp.morfologik.tagdict; -import static org.junit.Assert.assertTrue; +import static org.junit.Assert.*; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; @@ -71,6 +71,8 @@ public class POSTaggerFactoryTest { POSTaggerFactory factory = posModel.getFactory(); assertTrue(factory.getTagDictionary() instanceof MorfologikTagDictionary); + factory = null; + ByteArrayOutputStream out = new ByteArrayOutputStream(); posModel.serialize(out); ByteArrayInputStream in = new ByteArrayInputStream(out.toByteArray()); @@ -79,6 +81,8 @@ public class POSTaggerFactoryTest { factory = fromSerialized.getFactory(); assertTrue(factory.getTagDictionary() instanceof MorfologikTagDictionary); + + assertEquals(2, factory.getTagDictionary().getTags("casa").length); } } \ No newline at end of file
