OPENNLP-622 Preparing to migrate morfologik-addon to main repository
Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/772f31ff Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/772f31ff Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/772f31ff Branch: refs/heads/trunk Commit: 772f31ffe764afb675670735be556796781bda8d Parents: 0cced84 Author: William Colen <[email protected]> Authored: Wed Nov 9 18:23:28 2016 -0200 Committer: William Colen <[email protected]> Committed: Wed Nov 9 18:23:28 2016 -0200 ---------------------------------------------------------------------- bin/morfologik-addon | 20 -- bin/morfologik-addon.bat | 21 -- opennlp-morfologik-addon/bin/morfologik-addon | 20 ++ .../bin/morfologik-addon.bat | 21 ++ opennlp-morfologik-addon/pom.xml | 109 +++++++++ .../src/main/assembly/bin.xml | 91 ++++++++ .../src/main/assembly/src.xml | 39 ++++ .../src/main/bin/morfologik-addon | 35 +++ .../src/main/bin/morfologik-addon.bat | 47 ++++ .../src/main/bin/opennlp-cp | 35 +++ .../builder/MorfologikDictionayBuilder.java | 103 +++++++++ .../java/opennlp/morfologik/cmdline/CLI.java | 164 +++++++++++++ .../MorfologikDictionaryBuilderParams.java | 57 +++++ .../MorfologikDictionaryBuilderTool.java | 62 +++++ .../builder/XMLDictionaryToTableParams.java | 45 ++++ .../builder/XMLDictionaryToTableTool.java | 127 ++++++++++ .../lemmatizer/MorfologikLemmatizer.java | 96 ++++++++ .../tagdict/MorfologikPOSTaggerFactory.java | 170 ++++++++++++++ .../tagdict/MorfologikTagDictionary.java | 90 ++++++++ .../opennlp/morfologik/util/MorfologikUtil.java | 36 +++ .../src/main/readme/LICENSE | 230 +++++++++++++++++++ .../src/main/readme/MORFOLOGIK-LICENSE | 28 +++ opennlp-morfologik-addon/src/main/readme/NOTICE | 11 + .../builder/POSDictionayBuilderTest.java | 58 +++++ .../lemmatizer/MorfologikLemmatizerTest.java | 35 +++ .../tagdict/MorfologikTagDictionaryTest.java | 78 +++++++ .../tagdict/POSTaggerFactoryTest.java | 88 +++++++ 
.../src/test/resources/AnnotatedSentences.txt | 136 +++++++++++ .../src/test/resources/dictionaryWithLemma.info | 15 ++ .../src/test/resources/dictionaryWithLemma.txt | 11 + pom.xml | 109 --------- src/main/assembly/bin.xml | 91 -------- src/main/assembly/src.xml | 39 ---- src/main/bin/morfologik-addon | 35 --- src/main/bin/morfologik-addon.bat | 47 ---- src/main/bin/opennlp-cp | 35 --- .../builder/MorfologikDictionayBuilder.java | 103 --------- .../java/opennlp/morfologik/cmdline/CLI.java | 164 ------------- .../MorfologikDictionaryBuilderParams.java | 57 ----- .../MorfologikDictionaryBuilderTool.java | 62 ----- .../builder/XMLDictionaryToTableParams.java | 45 ---- .../builder/XMLDictionaryToTableTool.java | 127 ---------- .../lemmatizer/MorfologikLemmatizer.java | 96 -------- .../tagdict/MorfologikPOSTaggerFactory.java | 170 -------------- .../tagdict/MorfologikTagDictionary.java | 90 -------- .../opennlp/morfologik/util/MorfologikUtil.java | 36 --- src/main/readme/LICENSE | 230 ------------------- src/main/readme/MORFOLOGIK-LICENSE | 28 --- src/main/readme/NOTICE | 11 - .../builder/POSDictionayBuilderTest.java | 58 ----- .../lemmatizer/MorfologikLemmatizerTest.java | 35 --- .../tagdict/MorfologikTagDictionaryTest.java | 78 ------- .../tagdict/POSTaggerFactoryTest.java | 88 ------- src/test/resources/AnnotatedSentences.txt | 136 ----------- src/test/resources/dictionaryWithLemma.info | 15 -- src/test/resources/dictionaryWithLemma.txt | 11 - 56 files changed, 2037 insertions(+), 2037 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/bin/morfologik-addon ---------------------------------------------------------------------- diff --git a/bin/morfologik-addon b/bin/morfologik-addon deleted file mode 100755 index ccc635e..0000000 --- a/bin/morfologik-addon +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/sh - -# Licensed to the Apache Software Foundation (ASF) under one -# or more 
contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -mvn -e -q exec:java "-Dexec.mainClass=opennlp.morfologik.cmdline.CLI" "-Dexec.args=$*" http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/bin/morfologik-addon.bat ---------------------------------------------------------------------- diff --git a/bin/morfologik-addon.bat b/bin/morfologik-addon.bat deleted file mode 100644 index 26a4778..0000000 --- a/bin/morfologik-addon.bat +++ /dev/null @@ -1,21 +0,0 @@ -@ECHO OFF - -REM # Licensed to the Apache Software Foundation (ASF) under one -REM # or more contributor license agreements. See the NOTICE file -REM # distributed with this work for additional information -REM # regarding copyright ownership. The ASF licenses this file -REM # to you under the Apache License, Version 2.0 (the -REM # "License"); you may not use this file except in compliance -REM # with the License. You may obtain a copy of the License at -REM # -REM # http://www.apache.org/licenses/LICENSE-2.0 -REM # -REM # Unless required by applicable law or agreed to in writing, -REM # software distributed under the License is distributed on an -REM # -REM # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -REM # KIND, either express or implied. 
See the License for the -REM # specific language governing permissions and limitations -REM # under the License. - -mvn -e -q exec:java "-Dexec.mainClass=opennlp.morfologik.cmdline.CLI" "-Dexec.args=%*" http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/opennlp-morfologik-addon/bin/morfologik-addon ---------------------------------------------------------------------- diff --git a/opennlp-morfologik-addon/bin/morfologik-addon b/opennlp-morfologik-addon/bin/morfologik-addon new file mode 100755 index 0000000..ccc635e --- /dev/null +++ b/opennlp-morfologik-addon/bin/morfologik-addon @@ -0,0 +1,20 @@ +#!/bin/sh + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +mvn -e -q exec:java "-Dexec.mainClass=opennlp.morfologik.cmdline.CLI" "-Dexec.args=$*" http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/opennlp-morfologik-addon/bin/morfologik-addon.bat ---------------------------------------------------------------------- diff --git a/opennlp-morfologik-addon/bin/morfologik-addon.bat b/opennlp-morfologik-addon/bin/morfologik-addon.bat new file mode 100644 index 0000000..26a4778 --- /dev/null +++ b/opennlp-morfologik-addon/bin/morfologik-addon.bat @@ -0,0 +1,21 @@ +@ECHO OFF + +REM # Licensed to the Apache Software Foundation (ASF) under one +REM # or more contributor license agreements. See the NOTICE file +REM # distributed with this work for additional information +REM # regarding copyright ownership. The ASF licenses this file +REM # to you under the Apache License, Version 2.0 (the +REM # "License"); you may not use this file except in compliance +REM # with the License. You may obtain a copy of the License at +REM # +REM # http://www.apache.org/licenses/LICENSE-2.0 +REM # +REM # Unless required by applicable law or agreed to in writing, +REM # software distributed under the License is distributed on an +REM # +REM # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +REM # KIND, either express or implied. See the License for the +REM # specific language governing permissions and limitations +REM # under the License. 
+ +mvn -e -q exec:java "-Dexec.mainClass=opennlp.morfologik.cmdline.CLI" "-Dexec.args=%*" http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/opennlp-morfologik-addon/pom.xml ---------------------------------------------------------------------- diff --git a/opennlp-morfologik-addon/pom.xml b/opennlp-morfologik-addon/pom.xml new file mode 100644 index 0000000..56d0e47 --- /dev/null +++ b/opennlp-morfologik-addon/pom.xml @@ -0,0 +1,109 @@ +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <groupId>org.apache.opennlp</groupId> + <artifactId>morfologik-addon</artifactId> + <version>1.0-SNAPSHOT</version> + <packaging>jar</packaging> + <name>Morfologik Addon</name> + + <url>http://maven.apache.org</url> + <build> + <plugins> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-compiler-plugin</artifactId> + <version>2.3.2</version> + <configuration> + <source>1.7</source> + <target>1.7</target> + </configuration> + </plugin> + <plugin> + <artifactId>maven-assembly-plugin</artifactId> + <executions> + <execution> + <id>bundle-project-sources</id> + <phase>package</phase> + <goals> + <goal>single</goal> + </goals> + <configuration> + <descriptors> + <descriptor>src/main/assembly/bin.xml</descriptor> + <descriptor>src/main/assembly/src.xml</descriptor> + </descriptors> + <!-- Tar package is only compatible with gnu tar, + many file have more than 100 chars. + Right now only javadoc files are too long. 
+ --> + <tarLongFileMode>gnu</tarLongFileMode> + + <finalName>apache-opennlp-morfologik-addon-${project.version}</finalName> + </configuration> + </execution> + </executions> + </plugin> + <plugin> + <artifactId>maven-antrun-plugin</artifactId> + <version>1.6</version> + <executions> + <execution> + <id>generate checksums for binary artifacts</id> + <goals><goal>run</goal></goals> + <phase>verify</phase> + <configuration> + <target> + <checksum algorithm="sha1" format="MD5SUM"> + <fileset dir="${project.build.directory}"> + <include name="*.zip" /> + <include name="*.gz" /> + </fileset> + </checksum> + <checksum algorithm="md5" format="MD5SUM"> + <fileset dir="${project.build.directory}"> + <include name="*.zip" /> + <include name="*.gz" /> + </fileset> + </checksum> + </target> + </configuration> + </execution> + </executions> + </plugin> + </plugins> + </build> + <properties> + <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> + </properties> + + <dependencies> + <dependency> + <groupId>org.carrot2</groupId> + <artifactId>morfologik-stemming</artifactId> + <version>2.1.0</version> + <scope>compile</scope> + </dependency> + <dependency> + <groupId>org.carrot2</groupId> + <artifactId>morfologik-tools</artifactId> + <version>2.1.0</version> + <scope>compile</scope> + </dependency> + + <dependency> + <groupId>org.apache.opennlp</groupId> + <artifactId>opennlp-tools</artifactId> + <version>1.6.0</version> + </dependency> + + <dependency> + <groupId>junit</groupId> + <artifactId>junit</artifactId> + <version>4.8.1</version> + <scope>test</scope> + </dependency> + + </dependencies> +</project> http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/opennlp-morfologik-addon/src/main/assembly/bin.xml ---------------------------------------------------------------------- diff --git a/opennlp-morfologik-addon/src/main/assembly/bin.xml b/opennlp-morfologik-addon/src/main/assembly/bin.xml new file mode 100644 index 0000000..ab4f6da --- /dev/null +++ 
b/opennlp-morfologik-addon/src/main/assembly/bin.xml @@ -0,0 +1,91 @@ +<?xml version="1.0" encoding="UTF-8"?> + +<!-- + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. +--> + +<assembly> + <id>bin</id> + <formats> + <format>tar.gz</format> + <format>zip</format> + <format>dir</format> + </formats> + + <includeBaseDirectory>true</includeBaseDirectory> + <baseDirectory>/apache-opennlp-morfologik-addon-${project.version}</baseDirectory> + + <dependencySets> + <dependencySet> + <scope>runtime</scope> + <unpack>false</unpack> + <useProjectArtifact>false</useProjectArtifact> + <fileMode>644</fileMode> + <directoryMode>755</directoryMode> + <outputDirectory>lib</outputDirectory> + <useTransitiveDependencies>true</useTransitiveDependencies> + </dependencySet> + </dependencySets> + + <fileSets> + <fileSet> + <directory>src/main/readme</directory> + <outputDirectory></outputDirectory> + <fileMode>644</fileMode> + <directoryMode>755</directoryMode> + </fileSet> + + <fileSet> + <directory>.</directory> + <outputDirectory></outputDirectory> + <filtered>true</filtered> + <fileMode>644</fileMode> + <directoryMode>755</directoryMode> + <includes> + <include>README</include> + <include>RELEASE_NOTES.html</include> + </includes> + </fileSet> + + <fileSet> + 
<directory>target</directory> + <outputDirectory></outputDirectory> + <fileMode>644</fileMode> + <directoryMode>755</directoryMode> + <includes> + <include>issuesFixed/**</include> + </includes> + </fileSet> + + <fileSet> + <directory>src/main/bin</directory> + <fileMode>755</fileMode> + <directoryMode>755</directoryMode> + <outputDirectory>bin</outputDirectory> + </fileSet> + + <fileSet> + <directory>target</directory> + <outputDirectory>lib</outputDirectory> + <includes> + <include>morfologik-addon-*.jar</include> + </includes> + </fileSet> + + </fileSets> +</assembly> http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/opennlp-morfologik-addon/src/main/assembly/src.xml ---------------------------------------------------------------------- diff --git a/opennlp-morfologik-addon/src/main/assembly/src.xml b/opennlp-morfologik-addon/src/main/assembly/src.xml new file mode 100644 index 0000000..cdcc9d3 --- /dev/null +++ b/opennlp-morfologik-addon/src/main/assembly/src.xml @@ -0,0 +1,39 @@ +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> + +<assembly> + <id>src</id> + <formats> + <format>tar.gz</format> + <format>zip</format> + </formats> + + <baseDirectory>/apache-opennlp-${project.version}-src</baseDirectory> + + <fileSets> + <fileSet> + <directory>../</directory> + <outputDirectory></outputDirectory> + <excludes> + <exclude>**/target/**</exclude> + <exclude>**/.*/**</exclude> + <exclude>**/pom.xml.releaseBackup</exclude> + <exclude>**/release.properties</exclude> + </excludes> + </fileSet> + </fileSets> +</assembly> \ No newline at end of file http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/opennlp-morfologik-addon/src/main/bin/morfologik-addon ---------------------------------------------------------------------- diff --git a/opennlp-morfologik-addon/src/main/bin/morfologik-addon b/opennlp-morfologik-addon/src/main/bin/morfologik-addon new file mode 100755 index 0000000..9b0faf9 --- /dev/null +++ b/opennlp-morfologik-addon/src/main/bin/morfologik-addon @@ -0,0 +1,35 @@ +#!/bin/sh + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Note: Do not output anything in this script file, any output +# may be inadvertantly placed in any output files if +# output redirection is used. 
+ +if [ -z "$JAVACMD" ] ; then + if [ -n "$JAVA_HOME" ] ; then + JAVACMD="$JAVA_HOME/bin/java" + else + JAVACMD="`which java`" + fi +fi + +# Might fail if $0 is a link +OPENNLP_HOME=`dirname "$0"`/.. + +$JAVACMD -Xmx1024m -cp "lib/*" opennlp.morfologik.cmdline.CLI $@ http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/opennlp-morfologik-addon/src/main/bin/morfologik-addon.bat ---------------------------------------------------------------------- diff --git a/opennlp-morfologik-addon/src/main/bin/morfologik-addon.bat b/opennlp-morfologik-addon/src/main/bin/morfologik-addon.bat new file mode 100644 index 0000000..aeec31f --- /dev/null +++ b/opennlp-morfologik-addon/src/main/bin/morfologik-addon.bat @@ -0,0 +1,47 @@ +@ECHO off + +REM # Licensed to the Apache Software Foundation (ASF) under one +REM # or more contributor license agreements. See the NOTICE file +REM # distributed with this work for additional information +REM # regarding copyright ownership. The ASF licenses this file +REM # to you under the Apache License, Version 2.0 (the +REM # "License"); you may not use this file except in compliance +REM # with the License. You may obtain a copy of the License at +REM # +REM # http://www.apache.org/licenses/LICENSE-2.0 +REM # +REM # Unless required by applicable law or agreed to in writing, +REM # software distributed under the License is distributed on an +REM # # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +REM # KIND, either express or implied. See the License for the +REM # specific language governing permissions and limitations +REM # under the License. + +REM # Note: Do not output anything in this script file, any output +REM # may be inadvertantly placed in any output files if +REM # output redirection is used. 
+SETLOCAL + +IF "%JAVA_CMD%" == "" ( + IF "%JAVA_HOME%" == "" ( + SET JAVA_CMD=java + ) ELSE ( + REM # Keep JAVA_HOME to short-name without spaces + FOR %%A IN ("%JAVA_HOME%") DO SET JAVA_CMD=%%~sfA\bin\java + ) +) + +REM # Should work with Windows XP and greater. If not, specify the path to where it is installed. +IF "%OPENNLP_HOME%" == "" ( + SET OPENNLP_HOME=%~sp0.. +) ELSE ( + REM # Keep OPENNLP_HOME to short-name without spaces + FOR %%A IN ("%OPENNLP_HOME%") DO SET OPENNLP_HOME=%%~sfA +) + +REM # Get the library JAR file name (JIRA OPENNLP-554) +FOR %%A IN ("%OPENNLP_HOME%\lib\*.jar") DO SET JAR_FILE=%%A + +%JAVA_CMD% -Xmx1024m -jar %JAR_FILE% %* + +ENDLOCAL \ No newline at end of file http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/opennlp-morfologik-addon/src/main/bin/opennlp-cp ---------------------------------------------------------------------- diff --git a/opennlp-morfologik-addon/src/main/bin/opennlp-cp b/opennlp-morfologik-addon/src/main/bin/opennlp-cp new file mode 100755 index 0000000..dff0d12 --- /dev/null +++ b/opennlp-morfologik-addon/src/main/bin/opennlp-cp @@ -0,0 +1,35 @@ +#!/bin/sh + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +# Note: Do not output anything in this script file, any output +# may be inadvertantly placed in any output files if +# output redirection is used. + +if [ -z "$JAVACMD" ] ; then + if [ -n "$JAVA_HOME" ] ; then + JAVACMD="$JAVA_HOME/bin/java" + else + JAVACMD="`which java`" + fi +fi + +# Might fail if $0 is a link +OPENNLP_HOME=`dirname "$0"`/.. + +$JAVACMD -Xmx1024m -cp "lib/*" opennlp.tools.cmdline.CLI $@ http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/builder/MorfologikDictionayBuilder.java ---------------------------------------------------------------------- diff --git a/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/builder/MorfologikDictionayBuilder.java b/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/builder/MorfologikDictionayBuilder.java new file mode 100644 index 0000000..dbbca4d --- /dev/null +++ b/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/builder/MorfologikDictionayBuilder.java @@ -0,0 +1,103 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package opennlp.morfologik.builder; + +import java.io.FileNotFoundException; +import java.io.IOException; +import java.nio.charset.Charset; +import java.nio.file.Path; +import java.util.Properties; + +import morfologik.stemming.DictionaryMetadata; +import morfologik.stemming.EncoderType; +import morfologik.tools.DictCompile; + +/** + * Utility class to build Morfologik dictionaries from a tab separated values + * file. The first column is the word, the second its lemma and the third a POS + * tag. If there is no lemma information leave the second column empty. + */ +public class MorfologikDictionayBuilder { + + /** + * Helper to compile a morphological dictionary automaton. + * + * @param input + * The input file (base,inflected,tag). An associated metadata + * (*.info) file must exist. + * @param overwrite + * Overwrite the output file if it exists. + * @param validate + * Validate input to make sure it makes sense. + * @param acceptBom + * Accept leading BOM bytes (UTF-8). + * @param acceptCr + * Accept CR bytes in input sequences (\r). + * @param ignoreEmpty + * Ignore empty lines in the input. + * @return the dictionary path + * + * @throws Exception + */ + public Path build(Path input, boolean overwrite, boolean validate, + boolean acceptBom, boolean acceptCr, boolean ignoreEmpty) + throws Exception { + + DictCompile compiler = new DictCompile(input, overwrite, validate, + acceptBom, acceptCr, ignoreEmpty); + compiler.call(); + + + Path metadataPath = DictionaryMetadata + .getExpectedMetadataLocation(input); + + return metadataPath.resolveSibling( + metadataPath.getFileName().toString().replaceAll( + "\\." + DictionaryMetadata.METADATA_FILE_EXTENSION + "$", ".dict")); + } + + /** + * Helper to compile a morphological dictionary automaton using default + * parameters. + * + * @param input + * The input file (base,inflected,tag). An associated metadata + * (*.info) file must exist. 
+ * + * @return the dictionary path + * + * @throws Exception + */ + public Path build(Path input) throws Exception { + + return build(input, true, true, false, false, false); + + } + + Properties createProperties(Charset encoding, String separator, + EncoderType encoderType) throws FileNotFoundException, IOException { + + Properties properties = new Properties(); + properties.setProperty("fsa.dict.separator", separator); + properties.setProperty("fsa.dict.encoding", encoding.name()); + properties.setProperty("fsa.dict.encoder", encoderType.name()); + + return properties; + + } +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/cmdline/CLI.java ---------------------------------------------------------------------- diff --git a/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/cmdline/CLI.java b/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/cmdline/CLI.java new file mode 100644 index 0000000..f92d178 --- /dev/null +++ b/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/cmdline/CLI.java @@ -0,0 +1,164 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package opennlp.morfologik.cmdline; + +import java.util.Collections; +import java.util.LinkedHashMap; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import opennlp.morfologik.cmdline.builder.MorfologikDictionaryBuilderTool; +import opennlp.morfologik.cmdline.builder.XMLDictionaryToTableTool; +import opennlp.tools.cmdline.BasicCmdLineTool; +import opennlp.tools.cmdline.CmdLineTool; +import opennlp.tools.cmdline.StreamFactoryRegistry; +import opennlp.tools.cmdline.TerminateToolException; +import opennlp.tools.cmdline.TypedCmdLineTool; +import opennlp.tools.util.Version; + +public final class CLI { + + public static final String CMD = "opennlp-morfologik-addon"; + + private static Map<String, CmdLineTool> toolLookupMap; + + static { + toolLookupMap = new LinkedHashMap<String, CmdLineTool>(); + + List<CmdLineTool> tools = new LinkedList<CmdLineTool>(); + + tools.add(new MorfologikDictionaryBuilderTool()); + tools.add(new XMLDictionaryToTableTool()); + + for (CmdLineTool tool : tools) { + toolLookupMap.put(tool.getName(), tool); + } + + toolLookupMap = Collections.unmodifiableMap(toolLookupMap); + } + + /** + * @return a set which contains all tool names + */ + public static Set<String> getToolNames() { + return toolLookupMap.keySet(); + } + + private static void usage() { + System.out.print("OpenNLP Morfologik Addon " + + Version.currentVersion().toString() + ". 
"); + System.out.println("Usage: " + CMD + " TOOL"); + System.out.println("where TOOL is one of:"); + + // distance of tool name from line start + int numberOfSpaces = -1; + for (String toolName : toolLookupMap.keySet()) { + if (toolName.length() > numberOfSpaces) { + numberOfSpaces = toolName.length(); + } + } + numberOfSpaces = numberOfSpaces + 4; + + for (CmdLineTool tool : toolLookupMap.values()) { + + System.out.print(" " + tool.getName()); + + for (int i = 0; i < Math.abs(tool.getName().length() + - numberOfSpaces); i++) { + System.out.print(" "); + } + + System.out.println(tool.getShortDescription()); + } + + System.out + .println("All tools print help when invoked with help parameter"); + System.out + .println("Example: opennlp-morfologik-addon POSDictionaryBuilder help"); + } + + + @SuppressWarnings("rawtypes") + public static void main(String[] args) { + + if (args.length == 0) { + usage(); + System.exit(0); + } + + String toolArguments[] = new String[args.length -1]; + System.arraycopy(args, 1, toolArguments, 0, toolArguments.length); + + String toolName = args[0]; + + //check for format + String formatName = StreamFactoryRegistry.DEFAULT_FORMAT; + int idx = toolName.indexOf("."); + if (-1 < idx) { + formatName = toolName.substring(idx + 1); + toolName = toolName.substring(0, idx); + } + CmdLineTool tool = toolLookupMap.get(toolName); + + try { + if (null == tool) { + throw new TerminateToolException(1, "Tool " + toolName + " is not found."); + } + + if ((0 == toolArguments.length && tool.hasParams()) || + 0 < toolArguments.length && "help".equals(toolArguments[0])) { + if (tool instanceof TypedCmdLineTool) { + System.out.println(((TypedCmdLineTool) tool).getHelp(formatName)); + } else if (tool instanceof BasicCmdLineTool) { + System.out.println(tool.getHelp()); + } + + System.exit(0); + } + + if (tool instanceof TypedCmdLineTool) { + ((TypedCmdLineTool) tool).run(formatName, toolArguments); + } else if (tool instanceof BasicCmdLineTool) { + if (-1 == 
idx) { + ((BasicCmdLineTool) tool).run(toolArguments); + } else { + throw new TerminateToolException(1, "Tool " + toolName + " does not support formats."); + } + } else { + throw new TerminateToolException(1, "Tool " + toolName + " is not supported."); + } + } + catch (TerminateToolException e) { + + if (e.getMessage() != null) { + System.err.println(e.getMessage()); + } + + if (e.getCause() != null) { + System.err.println(e.getCause().getMessage()); + e.getCause().printStackTrace(System.err); + } + + System.exit(e.getCode()); + } + } + + +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderParams.java ---------------------------------------------------------------------- diff --git a/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderParams.java b/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderParams.java new file mode 100644 index 0000000..5ea2e4f --- /dev/null +++ b/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderParams.java @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.morfologik.cmdline.builder; + +import java.io.File; + +import opennlp.tools.cmdline.ArgumentParser.OptionalParameter; +import opennlp.tools.cmdline.ArgumentParser.ParameterDescription; +import opennlp.tools.cmdline.params.EncodingParameter; + +/** + * Params for Dictionary tools. + */ +interface MorfologikDictionaryBuilderParams extends EncodingParameter { + + @ParameterDescription(valueName = "in", description = "The input file (base,inflected,tag). An associated metadata (*.info) file must exist.") + File getInputFile(); + + @ParameterDescription(valueName = "true|false", description = "Accept leading BOM bytes (UTF-8).") + @OptionalParameter(defaultValue="false") + Boolean getAcceptBOM(); + + @ParameterDescription(valueName = "true|false", description = "Accept CR bytes in input sequences (\r).") + @OptionalParameter(defaultValue="false") + Boolean getAcceptCR(); + + @ParameterDescription(valueName = "FSA5|CFSA2", description = "Automaton serialization format.") + @OptionalParameter(defaultValue="FSA5") + String getFormat(); + + @ParameterDescription(valueName = "true|false", description = "Ignore empty lines in the input.") + @OptionalParameter(defaultValue="false") + Boolean getIgnoreEmpty(); + + @ParameterDescription(valueName = "true|false", description = "Overwrite the output file if it exists.") + @OptionalParameter(defaultValue="false") + Boolean getOverwrite(); + + @ParameterDescription(valueName = "true|false", description = "Validate input to make sure it makes sense.") + @OptionalParameter(defaultValue="false") + Boolean getValidate(); +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderTool.java ---------------------------------------------------------------------- diff --git 
a/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderTool.java b/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderTool.java new file mode 100644 index 0000000..eb9b51c --- /dev/null +++ b/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderTool.java @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package opennlp.morfologik.cmdline.builder; + +import java.io.File; +import java.nio.file.Path; + +import morfologik.stemming.DictionaryMetadata; +import opennlp.morfologik.builder.MorfologikDictionayBuilder; +import opennlp.tools.cmdline.BasicCmdLineTool; +import opennlp.tools.cmdline.CmdLineUtil; +import opennlp.tools.cmdline.TerminateToolException; + +public class MorfologikDictionaryBuilderTool extends BasicCmdLineTool { + + interface Params extends MorfologikDictionaryBuilderParams { + } + + public String getShortDescription() { + return "builds a binary POS Dictionary using Morfologik"; + } + + public String getHelp() { + return getBasicHelp(Params.class); + } + + public void run(String[] args) { + Params params = validateAndParseParams(args, Params.class); + + File dictInFile = params.getInputFile(); + + CmdLineUtil.checkInputFile("dictionary input file", dictInFile); + Path metadataPath = DictionaryMetadata.getExpectedMetadataLocation(dictInFile.toPath()); + CmdLineUtil.checkInputFile("dictionary metadata (.info) input file", metadataPath.toFile()); + + MorfologikDictionayBuilder builder = new MorfologikDictionayBuilder(); + try { + builder.build(dictInFile.toPath(), params.getOverwrite(), + params.getValidate(), params.getAcceptBOM(), params.getAcceptCR(), + params.getIgnoreEmpty()); + } catch (Exception e) { + throw new TerminateToolException(-1, + "Error while creating Morfologik POS Dictionay: " + e.getMessage(), e); + } + + } +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableParams.java ---------------------------------------------------------------------- diff --git a/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableParams.java b/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableParams.java new file mode 100644 index 0000000..4ee8cd4 --- /dev/null 
+++ b/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableParams.java @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.morfologik.cmdline.builder; + +import java.io.File; + +import opennlp.tools.cmdline.ArgumentParser.OptionalParameter; +import opennlp.tools.cmdline.ArgumentParser.ParameterDescription; +import opennlp.tools.cmdline.params.EncodingParameter; + +/** + * Params for Dictionary tools. + */ +interface XMLDictionaryToTableParams extends EncodingParameter { + + @ParameterDescription(valueName = "in", description = "OpenNLP XML Tag Dictionary.") + File getInputFile(); + + @ParameterDescription(valueName = "out", description = "Output for Morfologik (.info will be also created).") + File getOutputFile(); + + @ParameterDescription(valueName = "char", description = "Columm separator (must be a single character)") + @OptionalParameter(defaultValue=",") + String getSeparator(); + + @ParameterDescription(valueName = "value", description = " Type of lemma-inflected form encoding compression that precedes automaton construction. 
Allowed values: [suffix, infix, prefix, none].") + @OptionalParameter(defaultValue="prefix") + String getEncoder(); + +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableTool.java ---------------------------------------------------------------------- diff --git a/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableTool.java b/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableTool.java new file mode 100644 index 0000000..0e7f2d5 --- /dev/null +++ b/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableTool.java @@ -0,0 +1,127 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package opennlp.morfologik.cmdline.builder; + +import java.io.BufferedWriter; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.nio.charset.Charset; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Iterator; +import java.util.Properties; + +import morfologik.stemming.DictionaryMetadata; +import opennlp.tools.cmdline.BasicCmdLineTool; +import opennlp.tools.cmdline.CmdLineUtil; +import opennlp.tools.cmdline.TerminateToolException; +import opennlp.tools.postag.POSDictionary; + +public class XMLDictionaryToTableTool extends BasicCmdLineTool { + + interface Params extends XMLDictionaryToTableParams { + } + + private String SEPARATOR; + + public String getShortDescription() { + return "reads an OpenNLP XML tag dictionary and outputs it in a tab separated file"; + } + + public String getHelp() { + return getBasicHelp(Params.class); + } + + public void run(String[] args) { + Params params = validateAndParseParams(args, Params.class); + + File dictInFile = params.getInputFile(); + File dictOutFile = params.getOutputFile(); + Charset encoding = params.getEncoding(); + SEPARATOR = params.getSeparator(); + + CmdLineUtil.checkInputFile("dictionary input file", dictInFile); + CmdLineUtil.checkOutputFile("dictionary output file", dictOutFile); + + POSDictionary tagDictionary = null; + try { + tagDictionary = POSDictionary.create(new FileInputStream(dictInFile)); + } catch (IOException e) { + throw new TerminateToolException(-1, + "Error while loading XML POS Dictionay: " + e.getMessage(), e); + } + Iterator<String> iterator = tagDictionary.iterator(); + + try (BufferedWriter writer = Files.newBufferedWriter(dictOutFile.toPath(), + encoding)) { + while (iterator.hasNext()) { + String word = iterator.next(); + for (String tag : tagDictionary.getTags(word)) { + if(valid(word,tag)) { + String entry = createEntry(word, tag); + writer.write(entry); + writer.newLine(); + } + } + } + writer.close(); + 
System.out.println("Created dictionary: " + dictOutFile.toPath()); + } catch (IOException e) { + throw new TerminateToolException(-1, "Error while writing output: " + + e.getMessage(), e); + } + + Properties info = new Properties(); + info.setProperty("fsa.dict.separator", SEPARATOR); + info.setProperty("fsa.dict.encoding", params.getEncoding().name()); + info.setProperty("fsa.dict.encoder", params.getEncoder()); + + Path metaPath = DictionaryMetadata.getExpectedMetadataLocation(dictOutFile.toPath()); + + try { + info.store(Files.newOutputStream(metaPath), "Info file for FSA Morfologik dictionary."); + } catch (IOException e) { + throw new TerminateToolException(-1, "Error while writing metadata output: " + + e.getMessage(), e); + } + System.out.println("Created metadata: " + dictOutFile.toPath()); + + } + + private boolean valid(String word, String tag) { + if(word.contains(SEPARATOR) || tag.contains(SEPARATOR)) { + System.out + .println("Warn: invalid entry because contains separator - word: " + + word + " tag: " + tag); + return false; + } + + return true; + } + + private String createEntry(String word, String tag) { + + String entry = "" + SEPARATOR +// base + word + SEPARATOR + + tag; + + return entry; + } + +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizer.java ---------------------------------------------------------------------- diff --git a/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizer.java b/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizer.java new file mode 100644 index 0000000..2090ce5 --- /dev/null +++ b/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizer.java @@ -0,0 +1,96 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.morfologik.lemmatizer; + +import java.io.IOException; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import morfologik.stemming.Dictionary; +import morfologik.stemming.DictionaryLookup; +import morfologik.stemming.IStemmer; +import morfologik.stemming.WordData; +import opennlp.tools.lemmatizer.DictionaryLemmatizer; + +public class MorfologikLemmatizer implements DictionaryLemmatizer { + + private IStemmer dictLookup; + public final Set<String> constantTags = new HashSet<String>(Arrays.asList( + "NNP", "NP00000")); + + public MorfologikLemmatizer(Path dictionaryPath) throws IllegalArgumentException, + IOException { + dictLookup = new DictionaryLookup(Dictionary.read(dictionaryPath)); + } + + private HashMap<List<String>, String> getLemmaTagsDict(String word) { + List<WordData> wdList = dictLookup.lookup(word); + HashMap<List<String>, String> dictMap = new HashMap<List<String>, String>(); + for (WordData wd : wdList) { + List<String> wordLemmaTags = new ArrayList<String>(); + wordLemmaTags.add(word); + wordLemmaTags.add(wd.getTag().toString()); + dictMap.put(wordLemmaTags, wd.getStem().toString()); + } + return dictMap; + } 
+ + private List<String> getDictKeys(String word, String postag) { + List<String> keys = new ArrayList<String>(); + if (constantTags.contains(postag)) { + keys.addAll(Arrays.asList(word, postag)); + } else { + keys.addAll(Arrays.asList(word.toLowerCase(), postag)); + } + return keys; + } + + private HashMap<List<String>, String> getDictMap(String word, String postag) { + HashMap<List<String>, String> dictMap = new HashMap<List<String>, String>(); + + if (constantTags.contains(postag)) { + dictMap = this.getLemmaTagsDict(word); + } else { + dictMap = this.getLemmaTagsDict(word.toLowerCase()); + } + return dictMap; + } + + public String lemmatize(String word, String postag) { + String lemma = null; + List<String> keys = this.getDictKeys(word, postag); + HashMap<List<String>, String> dictMap = this.getDictMap(word, postag); + // lookup lemma as value of the map + String keyValue = dictMap.get(keys); + if (keyValue != null) { + lemma = keyValue; + } else if (keyValue == null && constantTags.contains(postag)) { + lemma = word; + } else if (keyValue == null && word.toUpperCase() == word) { + lemma = word; + } else { + lemma = word.toLowerCase(); + } + return lemma; + } +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java ---------------------------------------------------------------------- diff --git a/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java b/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java new file mode 100644 index 0000000..93d6c61 --- /dev/null +++ b/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java @@ -0,0 +1,170 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.morfologik.tagdict; + +import java.io.ByteArrayInputStream; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Map; + +import morfologik.stemming.DictionaryMetadata; +import opennlp.tools.dictionary.Dictionary; +import opennlp.tools.postag.POSTaggerFactory; +import opennlp.tools.postag.TagDictionary; +import opennlp.tools.util.InvalidFormatException; +import opennlp.tools.util.model.ArtifactSerializer; +import opennlp.tools.util.model.ModelUtil; + +public class MorfologikPOSTaggerFactory extends POSTaggerFactory { + + private static final String MORFOLOGIK_POSDICT_SUF = "morfologik_dict"; + private static final String MORFOLOGIK_DICT_INFO_SUF = "morfologik_info"; + + private static final String MORFOLOGIK_POSDICT = "tagdict." + + MORFOLOGIK_POSDICT_SUF; + private static final String MORFOLOGIK_DICT_INFO = "tagdict." 
+ + MORFOLOGIK_DICT_INFO_SUF; + + private TagDictionary dict; + + private byte[] dictInfo; + private byte[] dictData; + + public MorfologikPOSTaggerFactory() { + } + + public TagDictionary createTagDictionary(File dictionary) + throws InvalidFormatException, FileNotFoundException, IOException { + + if(!dictionary.canRead()) { + throw new FileNotFoundException("Could not read dictionary: " + dictionary.getAbsolutePath()); + } + + Path dictionaryMeta = DictionaryMetadata.getExpectedMetadataLocation(dictionary.toPath()); + + if(dictionaryMeta == null || !dictionaryMeta.toFile().canRead()) { + throw new FileNotFoundException("Could not read dictionary metadata: " + dictionaryMeta.getFileName()); + } + + this.dictData = Files.readAllBytes(dictionary.toPath()); + this.dictInfo = Files.readAllBytes(dictionaryMeta); + + return createMorfologikDictionary(dictData, dictInfo); + + } + + + @Override + protected void init(Dictionary ngramDictionary, TagDictionary posDictionary) { + super.init(ngramDictionary, null); + this.dict = posDictionary; + } + + @Override + public TagDictionary getTagDictionary() { + if (this.dict == null) { + + if (artifactProvider != null) { + Object obj = artifactProvider.getArtifact(MORFOLOGIK_POSDICT); + if (obj != null) { + byte[] data = (byte[]) artifactProvider + .getArtifact(MORFOLOGIK_POSDICT); + byte[] info = (byte[]) artifactProvider + .getArtifact(MORFOLOGIK_DICT_INFO); + + try { + this.dict = createMorfologikDictionary(data, info); + } catch (IllegalArgumentException e) { + throw new RuntimeException( + "Could not load the dictionary files to Morfologik.", e); + } catch (IOException e) { + throw new RuntimeException( + "IO error while reading the Morfologik dictionary files.", e); + } + } + } + } + + return this.dict; + } + + @Override + public void setTagDictionary(TagDictionary dictionary) { + this.dict = dictionary; + } + + @Override + public TagDictionary createEmptyTagDictionary() { + throw new UnsupportedOperationException( + 
"Morfologik POS Tagger factory does not support this operation"); + } + + @Override + public TagDictionary createTagDictionary(InputStream in) + throws InvalidFormatException, IOException { + throw new UnsupportedOperationException( + "Morfologik POS Tagger factory does not support this operation"); + } + + @Override + @SuppressWarnings("rawtypes") + public Map<String, ArtifactSerializer> createArtifactSerializersMap() { + Map<String, ArtifactSerializer> serializers = super + .createArtifactSerializersMap(); + + serializers.put(MORFOLOGIK_POSDICT_SUF, new ByteArraySerializer()); + serializers.put(MORFOLOGIK_DICT_INFO_SUF, new ByteArraySerializer()); + + return serializers; + } + + @Override + public Map<String, Object> createArtifactMap() { + Map<String, Object> artifactMap = super.createArtifactMap(); + artifactMap.put(MORFOLOGIK_POSDICT, this.dictData); + artifactMap.put(MORFOLOGIK_DICT_INFO, this.dictInfo); + return artifactMap; + } + + private TagDictionary createMorfologikDictionary(byte[] data, byte[] info) + throws IOException { + morfologik.stemming.Dictionary dict = morfologik.stemming.Dictionary + .read(new ByteArrayInputStream(data), new ByteArrayInputStream( + info)); + return new MorfologikTagDictionary(dict); + } + + static class ByteArraySerializer implements ArtifactSerializer<byte[]> { + + public byte[] create(InputStream in) throws IOException, + InvalidFormatException { + + return ModelUtil.read(in); + } + + public void serialize(byte[] artifact, OutputStream out) throws IOException { + out.write(artifact); + } + } + +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/tagdict/MorfologikTagDictionary.java ---------------------------------------------------------------------- diff --git a/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/tagdict/MorfologikTagDictionary.java b/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/tagdict/MorfologikTagDictionary.java 
new file mode 100644 index 0000000..b34ca2b --- /dev/null +++ b/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/tagdict/MorfologikTagDictionary.java @@ -0,0 +1,90 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.morfologik.tagdict; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import morfologik.stemming.Dictionary; +import morfologik.stemming.DictionaryLookup; +import morfologik.stemming.IStemmer; +import morfologik.stemming.WordData; +import opennlp.tools.postag.TagDictionary; + +/** + * A POS Tagger dictionary implementation based on Morfologik binary + * dictionaries + */ +public class MorfologikTagDictionary implements TagDictionary { + + private IStemmer dictLookup; + private boolean isCaseSensitive; + + /** + * Creates a case sensitive {@link MorfologikTagDictionary} + * + * @param dict + * a Morfologik FSA dictionary + * @throws IllegalArgumentException + * if FSA's root node cannot be acquired (dictionary is empty). 
+ * @throws IOException + * could not read dictionary from dictURL + */ + public MorfologikTagDictionary(Dictionary dict) + throws IllegalArgumentException, IOException { + this(dict, true); + } + + /** + * Creates MorfologikLemmatizer + * + * @param dict + * a Morfologik FSA dictionary + * @param caseSensitive + * if true it performs case sensitive lookup + * @throws IllegalArgumentException + * if FSA's root node cannot be acquired (dictionary is empty). + * @throws IOException + * could not read dictionary from dictURL + */ + public MorfologikTagDictionary(Dictionary dict, boolean caseSensitive) + throws IllegalArgumentException, IOException { + this.dictLookup = new DictionaryLookup(dict); + this.isCaseSensitive = caseSensitive; + } + + @Override + public String[] getTags(String word) { + if (!isCaseSensitive) { + word = word.toLowerCase(); + } + + List<WordData> data = dictLookup.lookup(word); + if (data != null && data.size() > 0) { + List<String> tags = new ArrayList<String>(data.size()); + for (int i = 0; i < data.size(); i++) { + tags.add(data.get(i).getTag().toString()); + } + if (tags.size() > 0) + return tags.toArray(new String[tags.size()]); + return null; + } + return null; + } +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/util/MorfologikUtil.java ---------------------------------------------------------------------- diff --git a/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/util/MorfologikUtil.java b/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/util/MorfologikUtil.java new file mode 100644 index 0000000..bd4d1a4 --- /dev/null +++ b/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/util/MorfologikUtil.java @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.morfologik.util; + +import java.io.File; + +import morfologik.stemming.DictionaryMetadata; + +public class MorfologikUtil { + + public static File getExpectedPropertiesFile(File dictFile) { + return DictionaryMetadata.getExpectedMetadataLocation(dictFile.toPath()) + .toFile(); + } + + public static File getExpectedPropertiesFile(String dictFile) { + File f = new File(dictFile); + return getExpectedPropertiesFile(f); + } + +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/opennlp-morfologik-addon/src/main/readme/LICENSE ---------------------------------------------------------------------- diff --git a/opennlp-morfologik-addon/src/main/readme/LICENSE b/opennlp-morfologik-addon/src/main/readme/LICENSE new file mode 100644 index 0000000..576b4cf --- /dev/null +++ b/opennlp-morfologik-addon/src/main/readme/LICENSE @@ -0,0 +1,230 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. 
+ + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of 
the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + +The following license applies to the Snowball stemmers: + + Copyright (c) 2001, Dr Martin Porter + Copyright (c) 2002, Richard Boulton + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * Neither the name of the copyright holders nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/opennlp-morfologik-addon/src/main/readme/MORFOLOGIK-LICENSE ---------------------------------------------------------------------- diff --git a/opennlp-morfologik-addon/src/main/readme/MORFOLOGIK-LICENSE b/opennlp-morfologik-addon/src/main/readme/MORFOLOGIK-LICENSE new file mode 100644 index 0000000..0554010 --- /dev/null +++ b/opennlp-morfologik-addon/src/main/readme/MORFOLOGIK-LICENSE @@ -0,0 +1,28 @@ +Copyright (c) 2006 Dawid Weiss +Copyright (c) 2007-2015 Dawid Weiss, Marcin Miłkowski +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. 
+ + * Neither the name of Morfologik nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/opennlp-morfologik-addon/src/main/readme/NOTICE ---------------------------------------------------------------------- diff --git a/opennlp-morfologik-addon/src/main/readme/NOTICE b/opennlp-morfologik-addon/src/main/readme/NOTICE new file mode 100644 index 0000000..73fb1d7 --- /dev/null +++ b/opennlp-morfologik-addon/src/main/readme/NOTICE @@ -0,0 +1,11 @@ +Apache OpenNLP +Copyright 2010, 2013 The Apache Software Foundation + +This product includes software developed at +The Apache Software Foundation (http://www.apache.org/). + +The snowball stemmers in +opennlp-tools/src/main/java/opennlp/tools/stemmer/snowball +were developed by Martin Porter and Richard Boulton. 
+The full snowball package is available from +http://snowball.tartarus.org/ http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/opennlp-morfologik-addon/src/test/java/opennlp/morfologik/builder/POSDictionayBuilderTest.java ---------------------------------------------------------------------- diff --git a/opennlp-morfologik-addon/src/test/java/opennlp/morfologik/builder/POSDictionayBuilderTest.java b/opennlp-morfologik-addon/src/test/java/opennlp/morfologik/builder/POSDictionayBuilderTest.java new file mode 100644 index 0000000..0a7ba48 --- /dev/null +++ b/opennlp-morfologik-addon/src/test/java/opennlp/morfologik/builder/POSDictionayBuilderTest.java @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package opennlp.morfologik.builder; + +import java.io.File; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.StandardCopyOption; + +import junit.framework.TestCase; +import morfologik.stemming.DictionaryMetadata; +import opennlp.morfologik.lemmatizer.MorfologikLemmatizer; + +import org.junit.Test; + +public class POSDictionayBuilderTest extends TestCase { + + @Test + public void testBuildDictionary() throws Exception { + + Path output = createMorfologikDictionary(); + + MorfologikLemmatizer ml = new MorfologikLemmatizer(output); + + assertNotNull(ml); + } + + public static Path createMorfologikDictionary() throws Exception { + Path tabFilePath = File.createTempFile( + POSDictionayBuilderTest.class.getName(), ".txt").toPath(); + Path infoFilePath = DictionaryMetadata.getExpectedMetadataLocation(tabFilePath); + + Files.copy(POSDictionayBuilderTest.class.getResourceAsStream( + "/dictionaryWithLemma.txt"), tabFilePath, StandardCopyOption.REPLACE_EXISTING); + Files.copy(POSDictionayBuilderTest.class.getResourceAsStream( + "/dictionaryWithLemma.info"), infoFilePath, StandardCopyOption.REPLACE_EXISTING); + + MorfologikDictionayBuilder builder = new MorfologikDictionayBuilder(); + + return builder.build(tabFilePath); + } + +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/opennlp-morfologik-addon/src/test/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizerTest.java ---------------------------------------------------------------------- diff --git a/opennlp-morfologik-addon/src/test/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizerTest.java b/opennlp-morfologik-addon/src/test/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizerTest.java new file mode 100644 index 0000000..6b7525e --- /dev/null +++ b/opennlp-morfologik-addon/src/test/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizerTest.java @@ -0,0 +1,35 @@ +package opennlp.morfologik.lemmatizer; + +import static org.junit.Assert.assertEquals; + +import 
java.nio.file.Path; + +import opennlp.morfologik.builder.POSDictionayBuilderTest; +import opennlp.tools.lemmatizer.DictionaryLemmatizer; + +import org.junit.Test; + +public class MorfologikLemmatizerTest { + + @Test + public void testLemmatizeInsensitive() throws Exception { + DictionaryLemmatizer dict = createDictionary(false); + + assertEquals("casar", dict.lemmatize("casa", "V")); + assertEquals("casa", dict.lemmatize("casa", "NOUN")); + + assertEquals("casa", dict.lemmatize("Casa", "PROP")); + + } + + private MorfologikLemmatizer createDictionary(boolean caseSensitive) + throws Exception { + + Path output = POSDictionayBuilderTest.createMorfologikDictionary(); + + MorfologikLemmatizer ml = new MorfologikLemmatizer(output); + + return ml; + } + +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/opennlp-morfologik-addon/src/test/java/opennlp/morfologik/tagdict/MorfologikTagDictionaryTest.java ---------------------------------------------------------------------- diff --git a/opennlp-morfologik-addon/src/test/java/opennlp/morfologik/tagdict/MorfologikTagDictionaryTest.java b/opennlp-morfologik-addon/src/test/java/opennlp/morfologik/tagdict/MorfologikTagDictionaryTest.java new file mode 100644 index 0000000..c6c9e04 --- /dev/null +++ b/opennlp-morfologik-addon/src/test/java/opennlp/morfologik/tagdict/MorfologikTagDictionaryTest.java @@ -0,0 +1,78 @@ +package opennlp.morfologik.tagdict; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +import java.util.Arrays; +import java.util.List; + +import morfologik.stemming.Dictionary; +import opennlp.morfologik.builder.POSDictionayBuilderTest; +import opennlp.tools.postag.TagDictionary; + +import org.junit.Test; + +public class MorfologikTagDictionaryTest { + + @Test + public void testNoLemma() throws Exception { + MorfologikTagDictionary dict = createDictionary(false); + + List<String> tags = Arrays.asList(dict.getTags("carro")); + assertEquals(1, tags.size()); + 
assertTrue(tags.contains("NOUN")); + + } + + @Test + public void testPOSDictionaryInsensitive() throws Exception { + TagDictionary dict = createDictionary(false); + + List<String> tags = Arrays.asList(dict.getTags("casa")); + assertEquals(2, tags.size()); + assertTrue(tags.contains("NOUN")); + assertTrue(tags.contains("V")); + + // this is the behavior of a case insensitive dictionary: + // when searched case insensitively, Casa is expected to return + // the same tags as the lower case entry casa + tags = Arrays.asList(dict.getTags("Casa")); + assertEquals(2, tags.size()); + assertTrue(tags.contains("NOUN")); + assertTrue(tags.contains("V")); + + } + + @Test + public void testPOSDictionarySensitive() throws Exception { + TagDictionary dict = createDictionary(true); + + List<String> tags = Arrays.asList(dict.getTags("casa")); + assertEquals(2, tags.size()); + assertTrue(tags.contains("NOUN")); + assertTrue(tags.contains("V")); + + // this is the behavior of a case sensitive dictionary: + // when searched case sensitively, Casa is expected to return + // only the proper noun tag, not the tags of lower case casa + tags = Arrays.asList(dict.getTags("Casa")); + assertEquals(1, tags.size()); + assertTrue(tags.contains("PROP")); + + } + + private MorfologikTagDictionary createDictionary(boolean caseSensitive) + throws Exception { + return this.createDictionary(caseSensitive, null); + } + + private MorfologikTagDictionary createDictionary(boolean caseSensitive, + List<String> constant) throws Exception { + + Dictionary dic = Dictionary.read(POSDictionayBuilderTest.createMorfologikDictionary()); + MorfologikTagDictionary ml = new MorfologikTagDictionary(dic, caseSensitive); + + return ml; + } + +}
