OPENNLP-622 Added code to create Morfologik data from TSV or OpenNLP XML tag dictionaries. Created a TagDictionary implementation using Morfologik. Added a POSTaggerFactory to bundle the Morfologik dictionaries in POS Tagger models.
Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/78dd579b Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/78dd579b Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/78dd579b Branch: refs/heads/trunk Commit: 78dd579b0e013b3132caae35afe71113764742e9 Parents: f3e9057 Author: William Colen <[email protected]> Authored: Mon Dec 2 13:23:04 2013 +0000 Committer: William Colen <[email protected]> Committed: Mon Dec 2 13:23:04 2013 +0000 ---------------------------------------------------------------------- pom.xml | 19 +- .../builder/MorfologikDictionayBuilder.java | 163 ++++++++++++++++ .../java/opennlp/morfologik/cmdline/CLI.java | 164 +++++++++++++++++ .../MorfologikDictionaryBuilderParams.java | 49 +++++ .../MorfologikDictionaryBuilderTool.java | 71 +++++++ .../builder/XMLDictionaryToTableParams.java | 36 ++++ .../builder/XMLDictionaryToTableTool.java | 82 +++++++++ .../tagdict/MorfologikPOSTaggerFactory.java | 184 +++++++++++++++++++ .../tagdict/MorfologikTagDictionary.java | 90 +++++++++ .../builder/POSDictionayBuilderTest.java | 101 ++++++++++ .../lemmatizer/MorfologikLemmatizerTest.java | 46 +++++ .../tagdict/MorfologikTagDictionaryTest.java | 92 ++++++++++ src/test/resources/dictionaryWithLemma.txt | 10 + 13 files changed, 1101 insertions(+), 6 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/opennlp/blob/78dd579b/pom.xml ---------------------------------------------------------------------- diff --git a/pom.xml b/pom.xml index 67e1eaa..51854f6 100644 --- a/pom.xml +++ b/pom.xml @@ -33,6 +33,12 @@ <version>1.6.0</version> <scope>compile</scope> </dependency> + <dependency> + <groupId>org.carrot2</groupId> + <artifactId>morfologik-tools</artifactId> + <version>1.6.0</version> + <scope>compile</scope> + </dependency> <dependency> <groupId>org.apache.opennlp</groupId> @@ -40,11 +46,12 @@ 
<version>1.6.0-SNAPSHOT</version> </dependency> - <dependency> - <groupId>junit</groupId> - <artifactId>junit</artifactId> - <version>3.8.1</version> - <scope>test</scope> - </dependency> + <dependency> + <groupId>junit</groupId> + <artifactId>junit</artifactId> + <version>4.8.1</version> + <scope>test</scope> + </dependency> + </dependencies> </project> http://git-wip-us.apache.org/repos/asf/opennlp/blob/78dd579b/src/main/java/opennlp/morfologik/builder/MorfologikDictionayBuilder.java ---------------------------------------------------------------------- diff --git a/src/main/java/opennlp/morfologik/builder/MorfologikDictionayBuilder.java b/src/main/java/opennlp/morfologik/builder/MorfologikDictionayBuilder.java new file mode 100644 index 0000000..b8bcfbf --- /dev/null +++ b/src/main/java/opennlp/morfologik/builder/MorfologikDictionayBuilder.java @@ -0,0 +1,163 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package opennlp.morfologik.builder; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.OutputStream; +import java.nio.charset.Charset; +import java.util.ArrayList; +import java.util.List; +import java.util.Properties; + +import morfologik.stemming.Dictionary; +import morfologik.tools.FSABuildTool; +import morfologik.tools.Launcher; + +/** + * Utility class to build Morfologik dictionaries from a tab separated values + * file. The first column is the word, the second its lemma and the third a POS + * tag. If there is no lemma information leave the second column empty. + */ +public class MorfologikDictionayBuilder { + + /** + * Build a Morfologik binary dictionary + * + * @param dictInFile + * the 3 column TSV dictionary file + * @param dictOutFile + * where to store the binary Morfologik dictionary + * @param encoding + * the encoding to be used while reading and writing + * @param separator + * a field separator, the default is '+'. 
If your tags contains '+' + * change to something else + * @param isUsePrefixes + * if to compact using prefixes + * @param isUseInfixes + * if to compact using infixes + * @throws Exception + */ + public void build(File dictInFile, File dictOutFile, Charset encoding, + String separator, boolean isUsePrefixes, boolean isUseInfixes) + throws Exception { + + File propertiesFile = new File( + Dictionary.getExpectedFeaturesName(dictOutFile.getAbsolutePath())); + this.build(dictInFile, dictOutFile, propertiesFile, encoding, separator, + isUsePrefixes, isUseInfixes); + } + + /** + * Build a Morfologik binary dictionary + * + * @param dictInFile + * the 3 column TSV dictionary file + * @param dictOutFile + * where to store the binary Morfologik dictionary + * @param propertiesOutFile + * where to store the properties of the Morfologik dictionary + * @param encoding + * the encoding to be used while reading and writing + * @param separator + * a field separator, the default is '+'. If your tags contains '+' + * change to something else + * @param isUsePrefixes + * if to compact using prefixes + * @param isUseInfixes + * if to compact using infixes + * @throws Exception + */ + public void build(File dictInFile, File dictOutFile, File propertiesOutFile, + Charset encoding, String separator, boolean isUsePrefixes, + boolean isUseInfixes) throws Exception { + + // we need to execute tab2morph followed by fsa_build + + File morph = tab2morph(dictInFile, separator, isUsePrefixes, isUseInfixes); + + fsaBuild(morph, dictOutFile); + + morph.delete(); + + // now we create the properties files using the passed parameters + createProperties(encoding, separator, isUsePrefixes, isUseInfixes, + propertiesOutFile); + } + + void createProperties(Charset encoding, String separator, + boolean isUsePrefixes, boolean isUseInfixes, File propertiesFile) + throws FileNotFoundException, IOException { + + Properties properties = new Properties(); + properties.setProperty("fsa.dict.separator", 
separator); + properties.setProperty("fsa.dict.encoding", encoding.name()); + properties.setProperty("fsa.dict.uses-prefixes", + Boolean.toString(isUsePrefixes)); + properties.setProperty("fsa.dict.uses-infixes", + Boolean.toString(isUseInfixes)); + + OutputStream os = new FileOutputStream(propertiesFile); + properties.store(os, "Morfologik POS Dictionary properties"); + os.close(); + + } + + private void fsaBuild(File morph, File dictOutFile) throws Exception { + String[] params = { "-f", "cfsa2", "-i", morph.getAbsolutePath(), "-o", + dictOutFile.getAbsolutePath() }; + FSABuildTool.main(params); + } + + private File tab2morph(File dictInFile, String separator, + boolean isUsePrefixes, boolean isUseInfixes) throws Exception { + + // create tab2morph parameters + List<String> tag2morphParams = new ArrayList<String>(); + tag2morphParams.add("tab2morph"); + + tag2morphParams.add("--annotation"); + tag2morphParams.add(separator); + + if (isUsePrefixes) { + tag2morphParams.add("-pre"); + } + + if (isUseInfixes) { + tag2morphParams.add("-inf"); + } + + tag2morphParams.add("-i"); + tag2morphParams.add(dictInFile.getAbsolutePath()); + + // we need a temporary file to store the intermediate output + File tmp = File.createTempFile("tab2morph", ".txt"); + tmp.deleteOnExit(); + + tag2morphParams.add("-o"); + tag2morphParams.add(tmp.getAbsolutePath()); + + Launcher.main(tag2morphParams.toArray(new String[tag2morphParams.size()])); + + return tmp; + } + +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/78dd579b/src/main/java/opennlp/morfologik/cmdline/CLI.java ---------------------------------------------------------------------- diff --git a/src/main/java/opennlp/morfologik/cmdline/CLI.java b/src/main/java/opennlp/morfologik/cmdline/CLI.java new file mode 100644 index 0000000..66a5151 --- /dev/null +++ b/src/main/java/opennlp/morfologik/cmdline/CLI.java @@ -0,0 +1,164 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license 
agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.morfologik.cmdline; + +import java.util.Collections; +import java.util.LinkedHashMap; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import opennlp.morfologik.cmdline.builder.MorfologikDictionaryBuilderTool; +import opennlp.morfologik.cmdline.builder.XMLDictionaryToTableTool; +import opennlp.tools.cmdline.BasicCmdLineTool; +import opennlp.tools.cmdline.CmdLineTool; +import opennlp.tools.cmdline.StreamFactoryRegistry; +import opennlp.tools.cmdline.TerminateToolException; +import opennlp.tools.cmdline.TypedCmdLineTool; +import opennlp.tools.util.Version; + +public final class CLI { + + public static final String CMD = "opennlp-morfologik-addon"; + + private static Map<String, CmdLineTool> toolLookupMap; + + static { + toolLookupMap = new LinkedHashMap<String, CmdLineTool>(); + + List<CmdLineTool> tools = new LinkedList<CmdLineTool>(); + + tools.add(new MorfologikDictionaryBuilderTool()); + tools.add(new XMLDictionaryToTableTool()); + + for (CmdLineTool tool : tools) { + toolLookupMap.put(tool.getName(), tool); + } + + toolLookupMap = Collections.unmodifiableMap(toolLookupMap); + } + + /** + * @return a set which contains all tool names + */ + public static Set<String> getToolNames() { + return 
toolLookupMap.keySet(); + } + + private static void usage() { + System.out.print("OpenNLP Morfologik Addon " + + Version.currentVersion().toString() + ". "); + System.out.println("Usage: " + CMD + " TOOL"); + System.out.println("where TOOL is one of:"); + + // distance of tool name from line start + int numberOfSpaces = -1; + for (String toolName : toolLookupMap.keySet()) { + if (toolName.length() > numberOfSpaces) { + numberOfSpaces = toolName.length(); + } + } + numberOfSpaces = numberOfSpaces + 4; + + for (CmdLineTool tool : toolLookupMap.values()) { + + System.out.print(" " + tool.getName()); + + for (int i = 0; i < Math.abs(tool.getName().length() + - numberOfSpaces); i++) { + System.out.print(" "); + } + + System.out.println(tool.getShortDescription()); + } + + System.out + .println("All tools print help when invoked with help parameter"); + System.out + .println("Example: opennlp-morfologik-addon POSDictionaryBuilder help"); + } + + public static void main(String[] args) { + + if (args.length == 0) { + usage(); + System.exit(0); + } + + String toolArguments[] = new String[args.length - 1]; + System.arraycopy(args, 1, toolArguments, 0, toolArguments.length); + + String toolName = args[0]; + + // check for format + String formatName = StreamFactoryRegistry.DEFAULT_FORMAT; + int idx = toolName.indexOf("."); + if (-1 < idx) { + formatName = toolName.substring(idx + 1); + toolName = toolName.substring(0, idx); + } + CmdLineTool tool = toolLookupMap.get(toolName); + + try { + if (null == tool) { + throw new TerminateToolException(1, "Tool " + toolName + + " is not found."); + } + + if ((0 == toolArguments.length && tool.hasParams()) + || 0 < toolArguments.length + && "help".equals(toolArguments[0])) { + if (tool instanceof TypedCmdLineTool) { + System.out.println(((TypedCmdLineTool) tool) + .getHelp(formatName)); + } else if (tool instanceof BasicCmdLineTool) { + System.out.println(tool.getHelp()); + } + + System.exit(0); + } + + if (tool instanceof 
TypedCmdLineTool) { + ((TypedCmdLineTool) tool).run(formatName, toolArguments); + } else if (tool instanceof BasicCmdLineTool) { + if (-1 == idx) { + ((BasicCmdLineTool) tool).run(toolArguments); + } else { + throw new TerminateToolException(1, "Tool " + toolName + + " does not support formats."); + } + } else { + throw new TerminateToolException(1, "Tool " + toolName + + " is not supported."); + } + } catch (TerminateToolException e) { + + if (e.getMessage() != null) { + System.err.println(e.getMessage()); + } + + if (e.getCause() != null) { + System.err.println(e.getCause().getMessage()); + e.getCause().printStackTrace(System.err); + } + + System.exit(e.getCode()); + } + } +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/78dd579b/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderParams.java ---------------------------------------------------------------------- diff --git a/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderParams.java b/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderParams.java new file mode 100644 index 0000000..0b1e896 --- /dev/null +++ b/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderParams.java @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.morfologik.cmdline.builder; + +import java.io.File; + +import opennlp.tools.cmdline.ArgumentParser.OptionalParameter; +import opennlp.tools.cmdline.ArgumentParser.ParameterDescription; +import opennlp.tools.cmdline.params.EncodingParameter; + +/** + * Params for Dictionary tools. + */ +interface MorfologikDictionaryBuilderParams extends EncodingParameter { + + @ParameterDescription(valueName = "in", description = "Plain file with one entry per line") + File getInputFile(); + + @ParameterDescription(valueName = "out", description = "The generated dictionary file.") + File getOutputFile(); + + @ParameterDescription(valueName = "sep", description = "The FSA dictionary separator. Default is '+'.") + @OptionalParameter(defaultValue = "+") + String getFSADictSeparator(); + + @ParameterDescription(valueName = "true|false", description = "Compact using prefixes.") + @OptionalParameter(defaultValue = "true") + Boolean getUsesPrefixes(); + + @ParameterDescription(valueName = "true|false", description = "Compact using infixes.") + @OptionalParameter(defaultValue = "true") + Boolean getUsesInfixes(); + +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/78dd579b/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderTool.java ---------------------------------------------------------------------- diff --git a/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderTool.java b/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderTool.java new file mode 100644 index 0000000..9da7e7d --- /dev/null +++ b/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderTool.java @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.morfologik.cmdline.builder; + +import java.io.File; +import java.nio.charset.Charset; + +import morfologik.stemming.Dictionary; +import opennlp.morfologik.builder.MorfologikDictionayBuilder; +import opennlp.tools.cmdline.BasicCmdLineTool; +import opennlp.tools.cmdline.CmdLineUtil; +import opennlp.tools.cmdline.TerminateToolException; + +public class MorfologikDictionaryBuilderTool extends BasicCmdLineTool { + + interface Params extends MorfologikDictionaryBuilderParams { + } + + public String getShortDescription() { + return "builds a binary POS Dictionary using Morfologik"; + } + + public String getHelp() { + return getBasicHelp(Params.class); + } + + public void run(String[] args) { + Params params = validateAndParseParams(args, Params.class); + + File dictInFile = params.getInputFile(); + File dictOutFile = params.getOutputFile(); + File propertiesFile = getExpectedPropertiesFile(dictOutFile); + Charset encoding = params.getEncoding(); + + CmdLineUtil.checkInputFile("dictionary input file", dictInFile); + CmdLineUtil.checkOutputFile("dictionary output file", dictOutFile); + CmdLineUtil.checkOutputFile("properties output file", propertiesFile); + + MorfologikDictionayBuilder builder = new MorfologikDictionayBuilder(); + try { + builder.build(dictInFile, 
dictOutFile, propertiesFile, encoding, + params.getFSADictSeparator(), params.getUsesPrefixes(), + params.getUsesInfixes()); + } catch (Exception e) { + throw new TerminateToolException(-1, + "Error while creating Morfologik POS Dictionay: " + e.getMessage(), e); + } + + } + + private File getExpectedPropertiesFile(File dictFile) { + return new File(Dictionary.getExpectedFeaturesName(dictFile + .getAbsolutePath())); + } + +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/78dd579b/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableParams.java ---------------------------------------------------------------------- diff --git a/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableParams.java b/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableParams.java new file mode 100644 index 0000000..b88cc5d --- /dev/null +++ b/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableParams.java @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package opennlp.morfologik.cmdline.builder; + +import java.io.File; + +import opennlp.tools.cmdline.ArgumentParser.ParameterDescription; +import opennlp.tools.cmdline.params.EncodingParameter; + +/** + * Params for Dictionary tools. + */ +interface XMLDictionaryToTableParams extends EncodingParameter { + + @ParameterDescription(valueName = "in", description = "OpenNLP XML Tag Dictionary.") + File getInputFile(); + + @ParameterDescription(valueName = "out", description = "Tab separated format.") + File getOutputFile(); + +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/78dd579b/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableTool.java ---------------------------------------------------------------------- diff --git a/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableTool.java b/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableTool.java new file mode 100644 index 0000000..c87f016 --- /dev/null +++ b/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableTool.java @@ -0,0 +1,82 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package opennlp.morfologik.cmdline.builder; + +import java.io.BufferedWriter; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.nio.charset.Charset; +import java.nio.file.Files; +import java.util.Iterator; + +import opennlp.tools.cmdline.BasicCmdLineTool; +import opennlp.tools.cmdline.CmdLineUtil; +import opennlp.tools.cmdline.TerminateToolException; +import opennlp.tools.postag.POSDictionary; + +public class XMLDictionaryToTableTool extends BasicCmdLineTool { + + interface Params extends XMLDictionaryToTableParams { + } + + public String getShortDescription() { + return "reads an OpenNLP XML tag dictionary and outputs it in a tab separated file"; + } + + public String getHelp() { + return getBasicHelp(Params.class); + } + + public void run(String[] args) { + Params params = validateAndParseParams(args, Params.class); + + File dictInFile = params.getInputFile(); + File dictOutFile = params.getOutputFile(); + Charset encoding = params.getEncoding(); + + CmdLineUtil.checkInputFile("dictionary input file", dictInFile); + CmdLineUtil.checkOutputFile("dictionary output file", dictOutFile); + + POSDictionary tagDictionary = null; + try { + tagDictionary = POSDictionary.create(new FileInputStream(dictInFile)); + } catch (IOException e) { + throw new TerminateToolException(-1, + "Error while loading XML POS Dictionay: " + e.getMessage(), e); + } + Iterator<String> iterator = tagDictionary.iterator(); + + try (BufferedWriter writer = Files.newBufferedWriter(dictOutFile.toPath(), + encoding)) { + while (iterator.hasNext()) { + String word = iterator.next(); + String wordAndLemma = word + "\t\t"; // lemma is empty + for (String tag : tagDictionary.getTags(word)) { + writer.write(wordAndLemma + tag); + writer.newLine(); + } + } + writer.close(); + } catch (IOException e) { + throw new TerminateToolException(-1, "Error while writing output: " + + e.getMessage(), e); + } + } + +} 
http://git-wip-us.apache.org/repos/asf/opennlp/blob/78dd579b/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java ---------------------------------------------------------------------- diff --git a/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java b/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java new file mode 100644 index 0000000..9b74ae5 --- /dev/null +++ b/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java @@ -0,0 +1,184 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package opennlp.morfologik.tagdict; + +import java.io.ByteArrayInputStream; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.nio.file.Files; +import java.nio.file.Paths; +import java.util.Map; + +import opennlp.tools.dictionary.Dictionary; +import opennlp.tools.postag.POSTaggerFactory; +import opennlp.tools.postag.TagDictionary; +import opennlp.tools.util.InvalidFormatException; +import opennlp.tools.util.model.ArtifactSerializer; +import opennlp.tools.util.model.ModelUtil; + +public class MorfologikPOSTaggerFactory extends POSTaggerFactory { + + private static final String MORFOLOGIK_POSDICT_SUF = "morfologik_dict"; + private static final String MORFOLOGIK_DICT_INFO_SUF = "morfologik_info"; + + private static final String MORFOLOGIK_POSDICT = "tagdict." + + MORFOLOGIK_POSDICT_SUF; + private static final String MORFOLOGIK_DICT_INFO = "tagdict." + + MORFOLOGIK_DICT_INFO_SUF; + + private TagDictionary dict; + + private byte[] dictInfo; + private byte[] dictData; + + public MorfologikPOSTaggerFactory() { + } + + public MorfologikPOSTaggerFactory(Dictionary ngramDictionary, + TagDictionary posDictionary) { + super(ngramDictionary, null); + } + + @Override + protected void init(Dictionary ngramDictionary, TagDictionary posDictionary) { + super.init(ngramDictionary, null); + this.dict = posDictionary; + + // get the dictionary path + String path = System.getProperty("morfologik.dict"); + if (path == null) { + throw new IllegalArgumentException( + "The property fsa.dict is missing! -Dmorfologik.dict=path"); + } + + // now we try to load it... 
+ try { + this.dictData = Files.readAllBytes(Paths.get(path)); + this.dictInfo = Files.readAllBytes(Paths + .get(morfologik.stemming.Dictionary.getExpectedFeaturesName(path))); + + this.dict = createMorfologikDictionary(dictData, dictInfo); + + } catch (IllegalArgumentException e) { + throw new IllegalArgumentException( + "The file is not a Morfologik dictionary!", e); + } catch (IOException e) { + throw new IllegalArgumentException( + "Could not open the Morfologik dictionary or the .info file", e); + } + } + + @Override + public TagDictionary getTagDictionary() { + if (this.dict == null) { + + if (artifactProvider != null) { + Object obj = artifactProvider.getArtifact(MORFOLOGIK_POSDICT); + if (obj != null) { + byte[] data = (byte[]) artifactProvider + .getArtifact(MORFOLOGIK_POSDICT); + byte[] info = (byte[]) artifactProvider + .getArtifact(MORFOLOGIK_DICT_INFO); + + try { + this.dict = createMorfologikDictionary(data, info); + } catch (IllegalArgumentException e) { + throw new RuntimeException( + "Could not load the dictionary files to Morfologik.", e); + } catch (IOException e) { + throw new RuntimeException( + "IO error while reading the Morfologik dictionary files.", e); + } + } + } + } + + return this.dict; + } + + @Override + public void setTagDictionary(TagDictionary dictionary) { + throw new UnsupportedOperationException( + "Morfologik POS Tagger factory does not support this operation"); + } + + @Override + public TagDictionary createEmptyTagDictionary() { + throw new UnsupportedOperationException( + "Morfologik POS Tagger factory does not support this operation"); + } + + @Override + public TagDictionary createTagDictionary(File dictionary) + throws InvalidFormatException, FileNotFoundException, IOException { + throw new UnsupportedOperationException( + "Morfologik POS Tagger factory does not support this operation"); + } + + @Override + public TagDictionary createTagDictionary(InputStream in) + throws InvalidFormatException, IOException { + throw new 
UnsupportedOperationException( + "Morfologik POS Tagger factory does not support this operation"); + } + + @Override + @SuppressWarnings("rawtypes") + public Map<String, ArtifactSerializer> createArtifactSerializersMap() { + Map<String, ArtifactSerializer> serializers = super + .createArtifactSerializersMap(); + + serializers.put(MORFOLOGIK_POSDICT_SUF, new ByteArraySerializer()); + serializers.put(MORFOLOGIK_DICT_INFO_SUF, new ByteArraySerializer()); + + return serializers; + } + + @Override + public Map<String, Object> createArtifactMap() { + Map<String, Object> artifactMap = super.createArtifactMap(); + artifactMap.put(MORFOLOGIK_POSDICT, this.dictData); + artifactMap.put(MORFOLOGIK_DICT_INFO, this.dictInfo); + return artifactMap; + } + + private TagDictionary createMorfologikDictionary(byte[] data, byte[] info) + throws IOException { + morfologik.stemming.Dictionary dict = morfologik.stemming.Dictionary + .readAndClose(new ByteArrayInputStream(data), new ByteArrayInputStream( + info)); + return new MorfologikTagDictionary(dict); + } + + static class ByteArraySerializer implements ArtifactSerializer<byte[]> { + + public byte[] create(InputStream in) throws IOException, + InvalidFormatException { + + return ModelUtil.read(in); + } + + public void serialize(byte[] artifact, OutputStream out) throws IOException { + out.write(artifact); + } + } + +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/78dd579b/src/main/java/opennlp/morfologik/tagdict/MorfologikTagDictionary.java ---------------------------------------------------------------------- diff --git a/src/main/java/opennlp/morfologik/tagdict/MorfologikTagDictionary.java b/src/main/java/opennlp/morfologik/tagdict/MorfologikTagDictionary.java new file mode 100644 index 0000000..b34ca2b --- /dev/null +++ b/src/main/java/opennlp/morfologik/tagdict/MorfologikTagDictionary.java @@ -0,0 +1,90 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.morfologik.tagdict;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Locale;
+
+import morfologik.stemming.Dictionary;
+import morfologik.stemming.DictionaryLookup;
+import morfologik.stemming.IStemmer;
+import morfologik.stemming.WordData;
+import opennlp.tools.postag.TagDictionary;
+
+/**
+ * A POS Tagger dictionary implementation based on Morfologik binary
+ * dictionaries.
+ * <p>
+ * NOTE(review): every lookup goes through one shared {@link DictionaryLookup};
+ * Morfologik stemmers are presumably not thread-safe, so confirm before
+ * sharing an instance between threads.
+ */
+public class MorfologikTagDictionary implements TagDictionary {
+
+  private final IStemmer dictLookup;
+  private final boolean isCaseSensitive;
+
+  /**
+   * Creates a case sensitive {@link MorfologikTagDictionary}.
+   *
+   * @param dict
+   *          a Morfologik FSA dictionary
+   * @throws IllegalArgumentException
+   *           if FSA's root node cannot be acquired (dictionary is empty).
+   * @throws IOException
+   *           declared for callers; NOTE(review): nothing in this constructor
+   *           visibly performs I/O - confirm whether it can still be raised
+   */
+  public MorfologikTagDictionary(Dictionary dict)
+      throws IllegalArgumentException, IOException {
+    this(dict, true);
+  }
+
+  /**
+   * Creates a {@link MorfologikTagDictionary} with explicit case handling.
+   *
+   * @param dict
+   *          a Morfologik FSA dictionary
+   * @param caseSensitive
+   *          if true the word is looked up exactly as given; if false the
+   *          word is lower-cased before the lookup (the dictionary entries
+   *          themselves are not normalized by this class)
+   * @throws IllegalArgumentException
+   *           if FSA's root node cannot be acquired (dictionary is empty).
+   * @throws IOException
+   *           declared for callers; NOTE(review): nothing in this constructor
+   *           visibly performs I/O - confirm whether it can still be raised
+   */
+  public MorfologikTagDictionary(Dictionary dict, boolean caseSensitive)
+      throws IllegalArgumentException, IOException {
+    this.dictLookup = new DictionaryLookup(dict);
+    this.isCaseSensitive = caseSensitive;
+  }
+
+  /**
+   * Looks up the tags recorded for a word.
+   *
+   * @param word
+   *          the word form to look up
+   * @return the tags of all matching dictionary entries, or {@code null} if
+   *         the word is unknown or none of its entries carries a tag
+   */
+  @Override
+  public String[] getTags(String word) {
+    // Locale.ROOT keeps the folding independent of the JVM default locale
+    // (e.g. the Turkish dotless i would otherwise change the lookup key).
+    String key = isCaseSensitive ? word : word.toLowerCase(Locale.ROOT);
+
+    List<WordData> data = dictLookup.lookup(key);
+    if (data == null || data.isEmpty()) {
+      return null;
+    }
+
+    List<String> tags = new ArrayList<String>(data.size());
+    for (WordData entry : data) {
+      // an entry may have no tag attached; skip it instead of failing
+      CharSequence tag = entry.getTag();
+      if (tag != null) {
+        tags.add(tag.toString());
+      }
+    }
+    return tags.isEmpty() ? null : tags.toArray(new String[tags.size()]);
+  }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/78dd579b/src/test/java/opennlp/morfologik/builder/POSDictionayBuilderTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/opennlp/morfologik/builder/POSDictionayBuilderTest.java b/src/test/java/opennlp/morfologik/builder/POSDictionayBuilderTest.java
new file mode 100644
index 0000000..16d1dac
--- /dev/null
+++ b/src/test/java/opennlp/morfologik/builder/POSDictionayBuilderTest.java
@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.morfologik.builder;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.Charset;
+import java.util.Properties;
+
+import junit.framework.TestCase;
+import opennlp.morfologik.lemmatizer.MorfologikLemmatizer;
+
+import org.junit.Test;
+
+/**
+ * Tests {@link MorfologikDictionayBuilder}: building a binary dictionary from
+ * the tabular test resource and writing the dictionary properties file.
+ */
+public class POSDictionayBuilderTest extends TestCase {
+
+  /** Builds a dictionary into a temp file and loads it with the lemmatizer. */
+  @Test
+  public void testBuildDictionary() throws Exception {
+    MorfologikDictionayBuilder builder = new MorfologikDictionayBuilder();
+    // File(URI) decodes URL escapes such as %20, which URL.getFile() leaves
+    // in place; this keeps the test working from a path containing spaces.
+    File dictInFile = new File(POSDictionayBuilderTest.class.getResource(
+        "/dictionaryWithLemma.txt").toURI());
+
+    File dictOutFile = File.createTempFile(
+        POSDictionayBuilderTest.class.getName(), ".dict");
+    // do not leave the generated dictionary behind in the temp directory
+    dictOutFile.deleteOnExit();
+
+    builder.build(dictInFile, dictOutFile, Charset.forName("UTF-8"), "+", true,
+        true);
+
+    MorfologikLemmatizer ml = new MorfologikLemmatizer(dictOutFile.toURI()
+        .toURL());
+
+    assertNotNull(ml);
+  }
+
+  /**
+   * Checks that encoding, separator and the prefix/infix flags are written to
+   * the properties file under the expected {@code fsa.dict.*} keys.
+   */
+  @Test
+  public void testPropertiesCreation() throws Exception {
+
+    Charset c = Charset.forName("iso-8859-1");
+    String sep = "_";
+    boolean pref = true;
+    boolean inf = true;
+    Properties p = createPropertiesHelper(c, sep, pref, inf);
+
+    assertEquals(c.name(), p.getProperty("fsa.dict.encoding"));
+    assertEquals(sep, p.getProperty("fsa.dict.separator"));
+    assertEquals(pref,
+        Boolean.parseBoolean(p.getProperty("fsa.dict.uses-prefixes")));
+    assertEquals(inf,
+        Boolean.parseBoolean(p.getProperty("fsa.dict.uses-infixes")));
+
+    pref = false;
+    inf = true;
+    p = createPropertiesHelper(c, sep, pref, inf);
+    assertEquals(pref,
+        Boolean.parseBoolean(p.getProperty("fsa.dict.uses-prefixes")));
+    assertEquals(inf,
+        Boolean.parseBoolean(p.getProperty("fsa.dict.uses-infixes")));
+
+    pref = true;
+    inf = false;
+    p = createPropertiesHelper(c, sep, pref, inf);
+    assertEquals(pref,
+        Boolean.parseBoolean(p.getProperty("fsa.dict.uses-prefixes")));
+    assertEquals(inf,
+        Boolean.parseBoolean(p.getProperty("fsa.dict.uses-infixes")));
+  }
+
+  /**
+   * Lets the builder write a properties file to a temp location and reads it
+   * back.
+   *
+   * @return the properties loaded from the file the builder produced
+   * @throws IOException
+   *           if the temp file cannot be created, written or read
+   */
+  private Properties createPropertiesHelper(Charset c, String sep,
+      boolean pref, boolean inf) throws IOException {
+    MorfologikDictionayBuilder builder = new MorfologikDictionayBuilder();
+    File f = File.createTempFile(POSDictionayBuilderTest.class.getName(),
+        ".info");
+    try {
+      builder.createProperties(c, sep, pref, inf, f);
+
+      Properties prop = new Properties();
+      InputStream is = new FileInputStream(f);
+      try {
+        prop.load(is);
+      } finally {
+        // close the stream even when load() fails
+        is.close();
+      }
+      return prop;
+    } finally {
+      // always remove the temp file, also on failure
+      f.delete();
+    }
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/78dd579b/src/test/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizerTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizerTest.java b/src/test/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizerTest.java
new file mode 100644
index 0000000..6fd6ec1
--- /dev/null
+++ b/src/test/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizerTest.java
@@ -0,0 +1,46 @@
+package opennlp.morfologik.lemmatizer;
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.File;
+import java.nio.charset.Charset;
+
+import opennlp.morfologik.builder.MorfologikDictionayBuilder;
+import opennlp.morfologik.builder.POSDictionayBuilderTest;
+import opennlp.tools.lemmatizer.DictionaryLemmatizer;
+
+import org.junit.Test;
+
+public class MorfologikLemmatizerTest {
+
+  @Test
+  public void testLemmatizeInsensitive() throws Exception {
+    DictionaryLemmatizer dict = createDictionary(false);
+
+    assertEquals("casar", dict.lemmatize("casa", "V"));
+    assertEquals("casa", dict.lemmatize("casa", "NOUN"));
+
+    assertEquals("casa", dict.lemmatize("Casa", "PROP"));
+
+  }
+
+  // NOTE(review): the caseSensitive argument is not referenced in the method
+  // body; the lemmatizer is always created the same way.
+  private MorfologikLemmatizer createDictionary(boolean caseSensitive)
+      throws Exception {
+
+    MorfologikDictionayBuilder builder = new
MorfologikDictionayBuilder();
+    // File(URI) decodes URL escapes such as %20, which URL.getFile() leaves
+    // in place; this keeps the test working from a path containing spaces.
+    File dictInFile = new File(POSDictionayBuilderTest.class.getResource(
+        "/dictionaryWithLemma.txt").toURI());
+
+    File dictOutFile = File.createTempFile(
+        POSDictionayBuilderTest.class.getName(), ".dict");
+    // do not leave the generated dictionary behind in the temp directory
+    dictOutFile.deleteOnExit();
+
+    builder.build(dictInFile, dictOutFile, Charset.forName("UTF-8"), "+", true,
+        true);
+
+    MorfologikLemmatizer ml = new MorfologikLemmatizer(dictOutFile.toURI()
+        .toURL());
+
+    return ml;
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/78dd579b/src/test/java/opennlp/morfologik/tagdict/MorfologikTagDictionaryTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/opennlp/morfologik/tagdict/MorfologikTagDictionaryTest.java b/src/test/java/opennlp/morfologik/tagdict/MorfologikTagDictionaryTest.java
new file mode 100644
index 0000000..def97b6
--- /dev/null
+++ b/src/test/java/opennlp/morfologik/tagdict/MorfologikTagDictionaryTest.java
@@ -0,0 +1,92 @@
+package opennlp.morfologik.tagdict;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import java.io.File;
+import java.nio.charset.Charset;
+import java.util.Arrays;
+import java.util.List;
+
+import morfologik.stemming.Dictionary;
+import opennlp.morfologik.builder.MorfologikDictionayBuilder;
+import opennlp.morfologik.builder.POSDictionayBuilderTest;
+import opennlp.morfologik.tagdict.MorfologikTagDictionary;
+import opennlp.tools.postag.TagDictionary;
+
+import org.junit.Test;
+
+/**
+ * Tests {@link MorfologikTagDictionary} lookups against a dictionary built
+ * from the {@code dictionaryWithLemma.txt} test resource.
+ */
+public class MorfologikTagDictionaryTest {
+
+  /** An entry listed without a lemma ("carro") must still yield its tag. */
+  @Test
+  public void testNoLemma() throws Exception {
+    MorfologikTagDictionary dict = createDictionary(false);
+
+    List<String> tags = Arrays.asList(dict.getTags("carro"));
+    assertEquals(1, tags.size());
+    assertTrue(tags.contains("NOUN"));
+
+  }
+
+  @Test
+  public void testPOSDictionaryInsensitive() throws Exception {
+    TagDictionary dict = createDictionary(false);
+
+    List<String> tags = Arrays.asList(dict.getTags("casa"));
+    assertEquals(2, tags.size());
+    assertTrue(tags.contains("NOUN"));
+    assertTrue(tags.contains("V"));
+
+    // A case insensitive dictionary lower-cases the query, so "Casa" finds
+    // the "casa" entries. The proper noun stored as "Casa" is not returned;
+    // it would have to be stored in lower case to be found this way.
+    tags = Arrays.asList(dict.getTags("Casa"));
+    assertEquals(2, tags.size());
+    assertTrue(tags.contains("NOUN"));
+    assertTrue(tags.contains("V"));
+
+  }
+
+  @Test
+  public void testPOSDictionarySensitive() throws Exception {
+    TagDictionary dict = createDictionary(true);
+
+    List<String> tags = Arrays.asList(dict.getTags("casa"));
+    assertEquals(2, tags.size());
+    assertTrue(tags.contains("NOUN"));
+    assertTrue(tags.contains("V"));
+
+    // A case sensitive dictionary uses the query as given, so "Casa" only
+    // matches the capitalized proper noun entry.
+    tags = Arrays.asList(dict.getTags("Casa"));
+    assertEquals(1, tags.size());
+    assertTrue(tags.contains("PROP"));
+
+  }
+
+  /** Builds the test dictionary with the requested case handling. */
+  private MorfologikTagDictionary createDictionary(boolean caseSensitive)
+      throws Exception {
+    return this.createDictionary(caseSensitive, null);
+  }
+
+  /**
+   * Compiles the test resource into a temporary Morfologik dictionary and
+   * wraps it in a {@link MorfologikTagDictionary}.
+   *
+   * @param caseSensitive
+   *          passed on to the {@link MorfologikTagDictionary} constructor
+   * @param constant
+   *          not used by this method
+   */
+  private MorfologikTagDictionary createDictionary(boolean caseSensitive,
+      List<String> constant) throws Exception {
+
+    MorfologikDictionayBuilder builder = new MorfologikDictionayBuilder();
+    // File(URI) decodes URL escapes such as %20, which URL.getFile() leaves
+    // in place; this keeps the test working from a path containing spaces.
+    File dictInFile = new File(POSDictionayBuilderTest.class.getResource(
+        "/dictionaryWithLemma.txt").toURI());
+
+    File dictOutFile = File.createTempFile(
+        POSDictionayBuilderTest.class.getName(), ".dict");
+    // do not leave the generated dictionary behind in the temp directory
+    dictOutFile.deleteOnExit();
+
+    builder.build(dictInFile, dictOutFile, Charset.forName("UTF-8"), "+", true,
+        true);
+
+    MorfologikTagDictionary ml = new MorfologikTagDictionary(
+        Dictionary.read(dictOutFile.toURI().toURL()), caseSensitive);
+
+    return ml;
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/78dd579b/src/test/resources/dictionaryWithLemma.txt
----------------------------------------------------------------------
diff --git
a/src/test/resources/dictionaryWithLemma.txt b/src/test/resources/dictionaryWithLemma.txt new file mode 100644 index 0000000..5ac7111 --- /dev/null +++ b/src/test/resources/dictionaryWithLemma.txt @@ -0,0 +1,10 @@ +casa casa NOUN +casa casar V +Casa Casa PROP +casinha casa NOUN +casona casa NOUN +menina menino NOUN +menino menino NOUN +meninão menino NOUN +menininho menino NOUN +carro NOUN
