Author: joern
Date: Wed Apr 15 13:40:38 2015
New Revision: 1673762
URL: http://svn.apache.org/r1673762
Log:
OPENNLP-765 Added CONLL-X POS Tagger performance tests
Added:
opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/eval/
opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/eval/ConllXPosTaggerEval.java
opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/eval/EvalUtil.java
Modified:
opennlp/trunk/opennlp-tools/pom.xml
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ConllXPOSSampleStream.java
Modified: opennlp/trunk/opennlp-tools/pom.xml
URL:
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/pom.xml?rev=1673762&r1=1673761&r2=1673762&view=diff
==============================================================================
--- opennlp/trunk/opennlp-tools/pom.xml (original)
+++ opennlp/trunk/opennlp-tools/pom.xml Wed Apr 15 13:40:38 2015
@@ -76,7 +76,10 @@
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<configuration>
- <argLine>-Xmx512m</argLine>
+ <argLine>-Xmx1024m</argLine>
+ <excludes>
+ <exclude>/opennlp/tools/eval/**/*</exclude>
+ </excludes>
</configuration>
</plugin>
Modified:
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ConllXPOSSampleStream.java
URL:
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ConllXPOSSampleStream.java?rev=1673762&r1=1673761&r2=1673762&view=diff
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ConllXPOSSampleStream.java (original)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ConllXPOSSampleStream.java Wed Apr 15 13:40:38 2015
@@ -46,7 +46,7 @@ public class ConllXPOSSampleStream exten
super(new ParagraphStream(lineStream));
}
- ConllXPOSSampleStream(InputStreamFactory in, Charset charset) throws IOException {
+ public ConllXPOSSampleStream(InputStreamFactory in, Charset charset) throws IOException {
super(new ParagraphStream(new PlainTextByLineStream(in, charset)));
}
Added:
opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/eval/ConllXPosTaggerEval.java
URL:
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/eval/ConllXPosTaggerEval.java?rev=1673762&view=auto
==============================================================================
--- opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/eval/ConllXPosTaggerEval.java (added)
+++ opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/eval/ConllXPosTaggerEval.java Wed Apr 15 13:40:38 2015
@@ -0,0 +1,170 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.eval;
+
+import java.io.File;
+import java.io.IOException;
+import java.nio.charset.Charset;
+
+import opennlp.tools.formats.ConllXPOSSampleStream;
+import opennlp.tools.postag.POSEvaluator;
+import opennlp.tools.postag.POSModel;
+import opennlp.tools.postag.POSSample;
+import opennlp.tools.postag.POSTaggerFactory;
+import opennlp.tools.postag.POSTaggerME;
+import opennlp.tools.util.MarkableFileInputStreamFactory;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.TrainingParameters;
+import opennlp.tools.util.model.ModelUtil;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * Evaluates the POS Tagger on the CONLL-X data. The CONLL-X data includes training and
+ * evaluation data for Danish, Dutch, Portuguese and Swedish.
+ * <p>
+ * The following files are needed in the data directory to run this test:<br>
+ * conllx/data/danish/ddt/train/danish_ddt_train.conll<br>
+ * conllx/data/danish/ddt/test/danish_ddt_test.conll<br>
+ * conllx/data/dutch/alpino/train/dutch_alpino_train.conll<br>
+ * conllx/data/dutch/alpino/test/dutch_alpino_test.conll<br>
+ * conllx/data/portuguese/bosque/treebank/portuguese_bosque_train.conll<br>
+ * conllx/data/portuguese/bosque/test/portuguese_bosque_test.conll<br>
+ * conllx/data/swedish/talbanken05/train/swedish_talbanken05_train.conll<br>
+ * conllx/data/swedish/talbanken05/test/swedish_talbanken05_test.conll<br>
+ * <p>
+ * The structure follows the structure of the CONLL-X data distribution. There is
+ * one package for each language, and an extra package containing the tests for all
+ * languages.
+ * <p>
+ * The data directory is resolved through {@link EvalUtil#getOpennlpDataDir()}.
+ */
+public class ConllXPosTaggerEval {
+
+  /** Encoding used to read the CONLL-X training and test files. */
+  private static final Charset UTF_8 = Charset.forName("UTF-8");
+
+  /**
+   * Opens a CONLL-X file as a stream of POS samples.
+   *
+   * @param conllFile the CONLL-X file to read
+   * @return a sample stream over the file; the caller is responsible for closing it
+   * @throws IOException if the stream cannot be created
+   */
+  private static ObjectStream<POSSample> openSamples(File conllFile) throws IOException {
+    return new ConllXPOSSampleStream(new MarkableFileInputStreamFactory(conllFile), UTF_8);
+  }
+
+  /**
+   * Trains a POS model on the given CONLL-X training file.
+   *
+   * @param trainFile the CONLL-X training file
+   * @param lang the language code handed to the trainer
+   * @param params the training parameters
+   * @return the trained model
+   * @throws IOException if reading the training data fails
+   */
+  private static POSModel train(File trainFile, String lang,
+      TrainingParameters params) throws IOException {
+
+    ObjectStream<POSSample> samples = openSamples(trainFile);
+    try {
+      return POSTaggerME.train(lang, samples, params, new POSTaggerFactory());
+    } finally {
+      // Release the file handle even if training fails.
+      samples.close();
+    }
+  }
+
+  /**
+   * Evaluates the model on the given CONLL-X test file and asserts the word accuracy.
+   *
+   * @param model the model to evaluate
+   * @param testData the CONLL-X test file
+   * @param expectedAccuracy the expected word accuracy, compared with a tolerance of 0.0001
+   * @throws IOException if reading the test data fails
+   */
+  private static void eval(POSModel model, File testData,
+      double expectedAccuracy) throws IOException {
+
+    POSEvaluator evaluator = new POSEvaluator(new POSTaggerME(model));
+
+    ObjectStream<POSSample> samples = openSamples(testData);
+    try {
+      evaluator.evaluate(samples);
+    } finally {
+      // Release the file handle even if evaluation fails.
+      samples.close();
+    }
+
+    Assert.assertEquals(expectedAccuracy, evaluator.getWordAccuracy(), 0.0001);
+  }
+
+  /**
+   * Trains with the default training parameters and evaluates the resulting model.
+   *
+   * @param lang the language code handed to the trainer
+   * @param trainPath path of the training file, relative to the data directory
+   * @param testPath path of the test file, relative to the data directory
+   * @param expectedAccuracy the expected word accuracy on the test file
+   * @throws IOException if reading the training or test data fails
+   */
+  private static void trainAndEval(String lang, String trainPath, String testPath,
+      double expectedAccuracy) throws IOException {
+
+    File dataDir = EvalUtil.getOpennlpDataDir();
+    TrainingParameters params = ModelUtil.createDefaultTrainingParameters();
+
+    POSModel model = train(new File(dataDir, trainPath), lang, params);
+    eval(model, new File(dataDir, testPath), expectedAccuracy);
+  }
+
+  @Test
+  public void evalDanish() throws IOException {
+    trainAndEval("da",
+        "conllx/data/danish/ddt/train/danish_ddt_train.conll",
+        "conllx/data/danish/ddt/test/danish_ddt_test.conll",
+        0.9512987012987013d);
+  }
+
+  @Test
+  public void evalDutch() throws IOException {
+    trainAndEval("nl",
+        "conllx/data/dutch/alpino/train/dutch_alpino_train.conll",
+        "conllx/data/dutch/alpino/test/dutch_alpino_test.conll",
+        0.9174574753804834d);
+  }
+
+  @Test
+  public void evalPortuguese() throws IOException {
+    trainAndEval("pt",
+        "conllx/data/portuguese/bosque/treebank/portuguese_bosque_train.conll",
+        "conllx/data/portuguese/bosque/test/portuguese_bosque_test.conll",
+        0.9659110277825124d);
+  }
+
+  @Test
+  public void evalSwedish() throws IOException {
+    // The ISO 639-1 code for Swedish is "sv"; "se" denotes Northern Sami.
+    trainAndEval("sv",
+        "conllx/data/swedish/talbanken05/train/swedish_talbanken05_train.conll",
+        "conllx/data/swedish/talbanken05/test/swedish_talbanken05_test.conll",
+        0.9275106082036775d);
+  }
+}
Added:
opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/eval/EvalUtil.java
URL:
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/eval/EvalUtil.java?rev=1673762&view=auto
==============================================================================
--- opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/eval/EvalUtil.java (added)
+++ opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/eval/EvalUtil.java Wed Apr 15 13:40:38 2015
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.eval;
+
+import java.io.File;
+
+/**
+ * Helper methods shared by the evaluation tests.
+ */
+public class EvalUtil {
+
+  /** Name of the system property that points to the evaluation data directory. */
+  private static final String DATA_DIR_PROPERTY = "OPENNLP_DATA_DIR";
+
+  /**
+   * Returns the directory which contains the evaluation data.
+   * <p>
+   * The location is read from the {@code OPENNLP_DATA_DIR} system property.
+   *
+   * @return the data directory named by the system property
+   * @throws IllegalStateException if the system property is not set or is empty
+   */
+  public static final File getOpennlpDataDir() {
+    String dataDir = System.getProperty(DATA_DIR_PROPERTY);
+
+    // new File(null) fails with a message-less NullPointerException; fail with a hint instead.
+    if (dataDir == null || dataDir.isEmpty()) {
+      throw new IllegalStateException("The system property " + DATA_DIR_PROPERTY
+          + " must point to the evaluation data directory");
+    }
+
+    return new File(dataDir);
+  }
+}