OPENNLP-975: Add format support for CoNLL-U format
Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/5b8535ba Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/5b8535ba Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/5b8535ba Branch: refs/heads/parser_regression Commit: 5b8535baa4b79a3be64b4c6829e199cd13cc1bbf Parents: b0a13d9 Author: Jörn Kottmann <jo...@apache.org> Authored: Mon Feb 6 19:43:33 2017 +0100 Committer: Jörn Kottmann <jo...@apache.org> Committed: Thu Apr 20 12:40:18 2017 +0200 ---------------------------------------------------------------------- .../tools/cmdline/StreamFactoryRegistry.java | 5 + .../formats/conllu/ConlluLemmaSampleStream.java | 57 ++++++++ .../conllu/ConlluLemmaSampleStreamFactory.java | 82 ++++++++++++ .../formats/conllu/ConlluPOSSampleStream.java | 56 ++++++++ .../conllu/ConlluPOSSampleStreamFactory.java | 82 ++++++++++++ .../tools/formats/conllu/ConlluSentence.java | 33 +++++ .../tools/formats/conllu/ConlluStream.java | 75 +++++++++++ .../tools/formats/conllu/ConlluTagset.java | 23 ++++ .../tools/formats/conllu/ConlluWordLine.java | 130 +++++++++++++++++++ .../formats/conllu/ConlluWordLineTest.java | 43 ++++++ 10 files changed, 586 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/opennlp/blob/5b8535ba/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java index 56625a9..9977519 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java +++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java @@ -42,6 +42,8 @@ import opennlp.tools.formats.ad.ADPOSSampleStreamFactory; import 
opennlp.tools.formats.ad.ADSentenceSampleStreamFactory; import opennlp.tools.formats.ad.ADTokenSampleStreamFactory; import opennlp.tools.formats.brat.BratNameSampleStreamFactory; +import opennlp.tools.formats.conllu.ConlluLemmaSampleStreamFactory; +import opennlp.tools.formats.conllu.ConlluPOSSampleStreamFactory; import opennlp.tools.formats.convert.NameToSentenceSampleStreamFactory; import opennlp.tools.formats.convert.NameToTokenSampleStreamFactory; import opennlp.tools.formats.convert.POSToSentenceSampleStreamFactory; @@ -110,6 +112,9 @@ public final class StreamFactoryRegistry { LetsmtSentenceStreamFactory.registerFactory(); MosesSentenceSampleStreamFactory.registerFactory(); + + ConlluPOSSampleStreamFactory.registerFactory(); + ConlluLemmaSampleStreamFactory.registerFactory(); } public static final String DEFAULT_FORMAT = "opennlp"; http://git-wip-us.apache.org/repos/asf/opennlp/blob/5b8535ba/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluLemmaSampleStream.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluLemmaSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluLemmaSampleStream.java new file mode 100644 index 0000000..0782120 --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluLemmaSampleStream.java @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.formats.conllu; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import opennlp.tools.lemmatizer.LemmaSample; +import opennlp.tools.util.FilterObjectStream; +import opennlp.tools.util.ObjectStream; + +public class ConlluLemmaSampleStream extends FilterObjectStream<ConlluSentence, LemmaSample> { + + private final ConlluTagset tagset; + + ConlluLemmaSampleStream(ObjectStream<ConlluSentence> samples, ConlluTagset tagset) { + super(samples); + this.tagset = tagset; + } + + @Override + public LemmaSample read() throws IOException { + ConlluSentence sentence = samples.read(); + + if (sentence != null) { + List<String> tokens = new ArrayList<>(); + List<String> tags = new ArrayList<>(); + List<String> lemmas = new ArrayList<>(); + + for (ConlluWordLine line : sentence.getWordLines()) { + tokens.add(line.getForm()); + tags.add(line.getPosTag(tagset)); + lemmas.add(line.getLemma()); + } + + return new LemmaSample(tokens, tags, lemmas); + } + + return null; + } +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/5b8535ba/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluLemmaSampleStreamFactory.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluLemmaSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluLemmaSampleStreamFactory.java new file mode 100644 index 0000000..4806967 --- /dev/null +++ 
b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluLemmaSampleStreamFactory.java @@ -0,0 +1,82 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.formats.conllu; + +import java.io.IOException; + +import opennlp.tools.cmdline.ArgumentParser; +import opennlp.tools.cmdline.CmdLineUtil; +import opennlp.tools.cmdline.StreamFactoryRegistry; +import opennlp.tools.cmdline.TerminateToolException; +import opennlp.tools.cmdline.params.BasicFormatParams; +import opennlp.tools.formats.AbstractSampleStreamFactory; +import opennlp.tools.lemmatizer.LemmaSample; +import opennlp.tools.util.InputStreamFactory; +import opennlp.tools.util.ObjectStream; + +/** + * <b>Note:</b> Do not use this class, internal use only! 
+ */ +public class ConlluLemmaSampleStreamFactory extends AbstractSampleStreamFactory<LemmaSample> { + + public static final String CONLLU_FORMAT = "conllu"; + + interface Parameters extends BasicFormatParams { + @ArgumentParser.ParameterDescription(valueName = "tagset", + description = "u|x u for unified tags and x for language-specific part-of-speech tags") + @ArgumentParser.OptionalParameter(defaultValue = "u") + String getTagset(); + } + + public static void registerFactory() { + StreamFactoryRegistry.registerFactory(LemmaSample.class, + CONLLU_FORMAT, new ConlluLemmaSampleStreamFactory(Parameters.class)); + } + + protected <P> ConlluLemmaSampleStreamFactory(Class<P> params) { + super(params); + } + + public ObjectStream<LemmaSample> create(String[] args) { + Parameters params = ArgumentParser.parse(args, Parameters.class); + + ConlluTagset tagset; + + switch (params.getTagset()) { + case "u": + tagset = ConlluTagset.U; + break; + case "x": + tagset = ConlluTagset.X; + break; + default: + throw new TerminateToolException(-1, "Unkown tagset parameter: " + params.getTagset()); + } + + InputStreamFactory inFactory = + CmdLineUtil.createInputStreamFactory(params.getData()); + + try { + return new ConlluLemmaSampleStream(new ConlluStream(inFactory), tagset); + } catch (IOException e) { + // That will throw an exception + CmdLineUtil.handleCreateObjectStreamError(e); + } + return null; + } +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/5b8535ba/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluPOSSampleStream.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluPOSSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluPOSSampleStream.java new file mode 100644 index 0000000..28dddc0 --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluPOSSampleStream.java @@ -0,0 +1,56 @@ +/* + * Licensed to 
the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.formats.conllu; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.Objects; + +import opennlp.tools.postag.POSSample; +import opennlp.tools.util.FilterObjectStream; +import opennlp.tools.util.ObjectStream; + +public class ConlluPOSSampleStream extends FilterObjectStream<ConlluSentence, POSSample> { + + private final ConlluTagset tagset; + + ConlluPOSSampleStream(ObjectStream<ConlluSentence> samples, ConlluTagset tagset) { + super(samples); + this.tagset = Objects.requireNonNull(tagset); + } + + @Override + public POSSample read() throws IOException { + ConlluSentence sentence = samples.read(); + + if (sentence != null) { + List<String> tokens = new ArrayList<>(); + List<String> tags = new ArrayList<>(); + + for (ConlluWordLine line : sentence.getWordLines()) { + tokens.add(line.getForm()); + tags.add(line.getPosTag(tagset)); + } + + return new POSSample(tokens, tags); + } + + return null; + } +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/5b8535ba/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluPOSSampleStreamFactory.java ---------------------------------------------------------------------- 
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluPOSSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluPOSSampleStreamFactory.java new file mode 100644 index 0000000..0f9d5f3 --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluPOSSampleStreamFactory.java @@ -0,0 +1,82 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.formats.conllu; + +import java.io.IOException; + +import opennlp.tools.cmdline.ArgumentParser; +import opennlp.tools.cmdline.CmdLineUtil; +import opennlp.tools.cmdline.StreamFactoryRegistry; +import opennlp.tools.cmdline.TerminateToolException; +import opennlp.tools.cmdline.params.BasicFormatParams; +import opennlp.tools.formats.AbstractSampleStreamFactory; +import opennlp.tools.postag.POSSample; +import opennlp.tools.util.InputStreamFactory; +import opennlp.tools.util.ObjectStream; + +/** + * <b>Note:</b> Do not use this class, internal use only! 
+ */ +public class ConlluPOSSampleStreamFactory extends AbstractSampleStreamFactory<POSSample> { + + public static final String CONLLU_FORMAT = "conllu"; + + interface Parameters extends BasicFormatParams { + @ArgumentParser.ParameterDescription(valueName = "tagset", + description = "u|x u for unified tags and x for language-specific part-of-speech tags") + @ArgumentParser.OptionalParameter(defaultValue = "u") + String getTagset(); + } + + public static void registerFactory() { + StreamFactoryRegistry.registerFactory(POSSample.class, + CONLLU_FORMAT, new ConlluPOSSampleStreamFactory(Parameters.class)); + } + + protected <P> ConlluPOSSampleStreamFactory(Class<P> params) { + super(params); + } + + public ObjectStream<POSSample> create(String[] args) { + Parameters params = ArgumentParser.parse(args, Parameters.class); + + ConlluTagset tagset; + + switch (params.getTagset()) { + case "u": + tagset = ConlluTagset.U; + break; + case "x": + tagset = ConlluTagset.X; + break; + default: + throw new TerminateToolException(-1, "Unkown tagset parameter: " + params.getTagset()); + } + + InputStreamFactory inFactory = + CmdLineUtil.createInputStreamFactory(params.getData()); + + try { + return new ConlluPOSSampleStream(new ConlluStream(inFactory), tagset); + } catch (IOException e) { + // That will throw an exception + CmdLineUtil.handleCreateObjectStreamError(e); + } + return null; + } +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/5b8535ba/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentence.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentence.java b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentence.java new file mode 100644 index 0000000..5d92d89 --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentence.java @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under 
one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.formats.conllu; + +import java.util.List; + +public class ConlluSentence { + + private List<ConlluWordLine> wordLines; + + ConlluSentence(List<ConlluWordLine> wordLines) { + this.wordLines = wordLines; + } + + public List<ConlluWordLine> getWordLines() { + return wordLines; + } +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/5b8535ba/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluStream.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluStream.java new file mode 100644 index 0000000..873a9ed --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluStream.java @@ -0,0 +1,75 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.formats.conllu; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.StringReader; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.List; + +import opennlp.tools.util.InputStreamFactory; +import opennlp.tools.util.ObjectStream; +import opennlp.tools.util.ParagraphStream; +import opennlp.tools.util.PlainTextByLineStream; + +/** + * The CoNLL-U format is specified here: + * http://universaldependencies.org/format.html + */ +public class ConlluStream implements ObjectStream<ConlluSentence> { + private final ObjectStream<String> sentenceStream; + + public ConlluStream(InputStreamFactory in) throws IOException { + this.sentenceStream = new ParagraphStream(new PlainTextByLineStream(in, StandardCharsets.UTF_8)); + } + + @Override + public ConlluSentence read() throws IOException { + String sentence = sentenceStream.read(); + + if (sentence != null) { + List<ConlluWordLine> wordLines = new ArrayList<>(); + + BufferedReader reader = new BufferedReader(new StringReader(sentence)); + + String line; + while ((line = reader.readLine()) != null) { + // # indicates a comment line and should be skipped + if (!line.trim().startsWith("#")) { + wordLines.add(new ConlluWordLine(line)); + } + } + + return new ConlluSentence(wordLines); + } + + return null; + } + + @Override + public void close() throws IOException { + sentenceStream.close(); + } + + @Override + public void reset() throws IOException, UnsupportedOperationException { + sentenceStream.reset(); + } +}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/5b8535ba/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluTagset.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluTagset.java b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluTagset.java new file mode 100644 index 0000000..f49f3fd --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluTagset.java @@ -0,0 +1,23 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package opennlp.tools.formats.conllu; + +enum ConlluTagset { + U, + X +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/5b8535ba/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluWordLine.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluWordLine.java b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluWordLine.java new file mode 100644 index 0000000..9881bf1 --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluWordLine.java @@ -0,0 +1,130 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package opennlp.tools.formats.conllu; + +import opennlp.tools.util.InvalidFormatException; + +public class ConlluWordLine { + + private final String id; + private final String form; + private final String lemma; + private final String uPosTag; + private final String xPosTag; + private final String feats; + private final String head; + private final String deprel; + private final String deps; + private final String misc; + + ConlluWordLine(String line) throws InvalidFormatException { + + String[] fields = line.split("\t"); + + if (fields.length != 10) { + throw new InvalidFormatException("Line must have exactly 10 fields"); + } + + id = fields[0]; + form = fields[1]; + lemma = fields[2]; + uPosTag = fields[3]; + xPosTag = fields[4]; + feats = fields[5]; + head = fields[6]; + deprel = fields[7]; + deps = fields[8]; + misc = fields[9]; + } + + /** + * Retrieves the word index. An Integer starting at 1 for each new sentence; + * may be a range for multiword tokens; may be a decimal number for empty nodes. + */ + public String getId() { + return id; + } + + /** + * Retrieve the word form or punctuation symbol. + */ + public String getForm() { + return form; + } + + /** + * Retrieve the lemma or stem of the word form. + */ + public String getLemma() { + return lemma; + } + + /** + * Retrieve the Universal part-of-speech tag or the language-specific part-of-speech tag; + * underscore if not available. + * + * @param tagset the type of tag to retrieve, either universal (u) or language-specific (x) + */ + public String getPosTag(ConlluTagset tagset) { + switch (tagset) { + case U: + return uPosTag; + case X: + return xPosTag; + default: + throw new IllegalStateException("Unexpected tagset value: " + tagset); + } + } + + /** + * Retrieve list of morphological features from the universal feature inventory or from a + * defined language-specific extension; underscore if not available.
+ */ + public String getFeats() { + return feats; + } + + /** + * Head of the current word, which is either a value of ID or zero (0). + */ + public String getHead() { + return head; + } + + /** + * Universal dependency relation to the HEAD (root iff HEAD = 0) or a + * defined language-specific subtype of one. + */ + public String getDeprel() { + return deprel; + } + + /** + * Enhanced dependency graph in the form of a list of head-deprel pairs. + */ + public String getDeps() { + return deps; + } + + /** + * Retrieve any other annotation. + */ + public String getMisc() { + return misc; + } +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/5b8535ba/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluWordLineTest.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluWordLineTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluWordLineTest.java new file mode 100644 index 0000000..4676f6f --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluWordLineTest.java @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package opennlp.tools.formats.conllu; + +import org.junit.Assert; +import org.junit.Test; + +import opennlp.tools.util.InvalidFormatException; + +public class ConlluWordLineTest { + + @Test + public void testParseLine() throws InvalidFormatException { + ConlluWordLine line = new ConlluWordLine( + "12\tHänden\tHand\tNOUN\tNN\tCase=Dat|Number=Plur\t5\tnmod\t_\t_"); + + Assert.assertEquals("12", line.getId()); + Assert.assertEquals("Händen", line.getForm()); + Assert.assertEquals("Hand", line.getLemma()); + Assert.assertEquals("NOUN", line.getPosTag(ConlluTagset.U)); + Assert.assertEquals("NN", line.getPosTag(ConlluTagset.X)); + Assert.assertEquals("Case=Dat|Number=Plur", line.getFeats()); + Assert.assertEquals("5", line.getHead()); + Assert.assertEquals("nmod", line.getDeprel()); + Assert.assertEquals("_", line.getDeps()); + Assert.assertEquals("_", line.getMisc()); + } +}