OPENNLP-1075 Add streams for sentence and token samples for conllu
Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/5bf5366e Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/5bf5366e Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/5bf5366e Branch: refs/heads/LangDetect Commit: 5bf5366e2d5eca700d33d5882b65a5795cb3d656 Parents: d378c06 Author: Jörn Kottmann <[email protected]> Authored: Tue May 23 17:28:33 2017 +0200 Committer: Jörn Kottmann <[email protected]> Committed: Wed May 24 16:29:51 2017 +0200 ---------------------------------------------------------------------- .../tools/cmdline/StreamFactoryRegistry.java | 4 ++ .../conllu/ConlluLemmaSampleStreamFactory.java | 5 +- .../tools/formats/conllu/ConlluSentence.java | 15 +++- .../conllu/ConlluSentenceSampleStream.java | 59 +++++++++++++++ .../ConlluSentenceSampleStreamFactory.java | 65 +++++++++++++++++ .../tools/formats/conllu/ConlluStream.java | 30 +++++++- .../formats/conllu/ConlluTokenSampleStream.java | 75 ++++++++++++++++++++ .../conllu/ConlluTokenSampleStreamFactory.java | 61 ++++++++++++++++ .../conllu/ConlluSentenceSampleStreamTest.java | 69 ++++++++++++++++++ .../tools/formats/conllu/ConlluStreamTest.java | 56 +++++++++++++++ .../conllu/ConlluTokenSampleStreamTest.java | 53 ++++++++++++++ .../formats/conllu/ConlluWordLineTest.java | 4 +- .../formats/conllu/de-ud-train-sample.conllu | 30 ++++++++ 13 files changed, 517 insertions(+), 9 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/opennlp/blob/5bf5366e/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java index 9977519..2cff212 100644 --- 
a/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java +++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java @@ -44,6 +44,8 @@ import opennlp.tools.formats.ad.ADTokenSampleStreamFactory; import opennlp.tools.formats.brat.BratNameSampleStreamFactory; import opennlp.tools.formats.conllu.ConlluLemmaSampleStreamFactory; import opennlp.tools.formats.conllu.ConlluPOSSampleStreamFactory; +import opennlp.tools.formats.conllu.ConlluSentenceSampleStreamFactory; +import opennlp.tools.formats.conllu.ConlluTokenSampleStreamFactory; import opennlp.tools.formats.convert.NameToSentenceSampleStreamFactory; import opennlp.tools.formats.convert.NameToTokenSampleStreamFactory; import opennlp.tools.formats.convert.POSToSentenceSampleStreamFactory; @@ -113,6 +115,8 @@ public final class StreamFactoryRegistry { LetsmtSentenceStreamFactory.registerFactory(); MosesSentenceSampleStreamFactory.registerFactory(); + ConlluTokenSampleStreamFactory.registerFactory(); + ConlluSentenceSampleStreamFactory.registerFactory(); ConlluPOSSampleStreamFactory.registerFactory(); ConlluLemmaSampleStreamFactory.registerFactory(); } http://git-wip-us.apache.org/repos/asf/opennlp/blob/5bf5366e/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluLemmaSampleStreamFactory.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluLemmaSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluLemmaSampleStreamFactory.java index 4806967..3204d7e 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluLemmaSampleStreamFactory.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluLemmaSampleStreamFactory.java @@ -34,8 +34,6 @@ import opennlp.tools.util.ObjectStream; */ public class ConlluLemmaSampleStreamFactory extends AbstractSampleStreamFactory<LemmaSample> { - public static final String 
CONLLU_FORMAT = "conllu"; - interface Parameters extends BasicFormatParams { @ArgumentParser.ParameterDescription(valueName = "tagset", description = "u|x u for unified tags and x for language-specific part-of-speech tags") @@ -45,7 +43,8 @@ public class ConlluLemmaSampleStreamFactory extends AbstractSampleStreamFactory< public static void registerFactory() { StreamFactoryRegistry.registerFactory(LemmaSample.class, - CONLLU_FORMAT, new ConlluLemmaSampleStreamFactory(Parameters.class)); + ConlluPOSSampleStreamFactory.CONLLU_FORMAT, + new ConlluLemmaSampleStreamFactory(Parameters.class)); } protected <P> ConlluLemmaSampleStreamFactory(Class<P> params) { http://git-wip-us.apache.org/repos/asf/opennlp/blob/5bf5366e/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentence.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentence.java b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentence.java index 5d92d89..bbd2b96 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentence.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentence.java @@ -23,11 +23,24 @@ public class ConlluSentence { private List<ConlluWordLine> wordLines; - ConlluSentence(List<ConlluWordLine> wordLines) { + private String sentenceIdComment; + private String textComment; + + ConlluSentence(List<ConlluWordLine> wordLines, String sentenceIdComment, String textComment) { this.wordLines = wordLines; + this.sentenceIdComment = sentenceIdComment; + this.textComment = textComment; } public List<ConlluWordLine> getWordLines() { return wordLines; } + + public String getSentenceIdComment() { + return sentenceIdComment; + } + + public String getTextComment() { + return textComment; + } } 
http://git-wip-us.apache.org/repos/asf/opennlp/blob/5bf5366e/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentenceSampleStream.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentenceSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentenceSampleStream.java new file mode 100644 index 0000000..f49e205 --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentenceSampleStream.java @@ -0,0 +1,59 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package opennlp.tools.formats.conllu; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import opennlp.tools.sentdetect.SentenceSample; +import opennlp.tools.util.FilterObjectStream; +import opennlp.tools.util.ObjectStream; +import opennlp.tools.util.Span; + +public class ConlluSentenceSampleStream extends FilterObjectStream<ConlluSentence, SentenceSample> { + + private final int sentencesPerSample; + + public ConlluSentenceSampleStream(ObjectStream<ConlluSentence> samples, int sentencesPerSample) { + super(samples); + this.sentencesPerSample = sentencesPerSample; + } + + @Override + public SentenceSample read() throws IOException { + StringBuilder documentText = new StringBuilder(); + + List<Span> sentenceSpans = new ArrayList<>(); + + ConlluSentence sentence; + for (int i = 0; i < sentencesPerSample && (sentence = samples.read()) != null; i++) { + + int startIndex = documentText.length(); + documentText.append(sentence.getTextComment()).append(' '); + sentenceSpans.add(new Span(startIndex, documentText.length() - 1)); + } + + if (documentText.length() > 0) { + documentText.setLength(documentText.length() - 1); + return new SentenceSample(documentText, sentenceSpans.toArray(new Span[sentenceSpans.size()])); + } + + return null; + } +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/5bf5366e/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentenceSampleStreamFactory.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentenceSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentenceSampleStreamFactory.java new file mode 100644 index 0000000..000af27 --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentenceSampleStreamFactory.java @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * 
contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.formats.conllu; + +import java.io.IOException; + +import opennlp.tools.cmdline.ArgumentParser; +import opennlp.tools.cmdline.CmdLineUtil; +import opennlp.tools.cmdline.StreamFactoryRegistry; +import opennlp.tools.cmdline.params.BasicFormatParams; +import opennlp.tools.formats.AbstractSampleStreamFactory; +import opennlp.tools.sentdetect.SentenceSample; +import opennlp.tools.util.InputStreamFactory; +import opennlp.tools.util.ObjectStream; + +public class ConlluSentenceSampleStreamFactory extends AbstractSampleStreamFactory<SentenceSample> { + + interface Parameters extends BasicFormatParams { + @ArgumentParser.ParameterDescription(valueName = "sentencesPerSample", + description = "number of sentences per sample") + String getSentencesPerSample(); + } + + public static void registerFactory() { + StreamFactoryRegistry.registerFactory(SentenceSample.class, + ConlluPOSSampleStreamFactory.CONLLU_FORMAT, + new ConlluSentenceSampleStreamFactory(ConlluSentenceSampleStreamFactory.Parameters.class)); + } + + protected <P> ConlluSentenceSampleStreamFactory(Class<P> params) { + super(params); + } + + @Override + public ObjectStream<SentenceSample> create(String[] args) { + Parameters params = ArgumentParser.parse(args, 
Parameters.class); + + InputStreamFactory inFactory = + CmdLineUtil.createInputStreamFactory(params.getData()); + + try { + return new ConlluSentenceSampleStream(new ConlluStream(inFactory), + Integer.parseInt(params.getSentencesPerSample())); + } catch (IOException e) { + // That will throw an exception + CmdLineUtil.handleCreateObjectStreamError(e); + } + return null; + } +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/5bf5366e/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluStream.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluStream.java index 873a9ed..cbac450 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluStream.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluStream.java @@ -49,15 +49,39 @@ public class ConlluStream implements ObjectStream<ConlluSentence> { BufferedReader reader = new BufferedReader(new StringReader(sentence)); + String sentenceId = null; + String text = null; + String line; while ((line = reader.readLine()) != null) { - // # indicates a comment line and should be skipped - if (!line.trim().startsWith("#")) { + // # indicates a comment line and contains additional data + if (line.trim().startsWith("#")) { + String commentLine = line.trim().substring(1); + + int separator = commentLine.indexOf('='); + + if (separator != -1) { + String firstPart = commentLine.substring(0, separator).trim(); + String secondPart = commentLine.substring(separator + 1, commentLine.length()).trim(); + + if (!secondPart.isEmpty()) { + switch (firstPart) { + case "sent_id": + sentenceId = secondPart; + break; + case "text": + text = secondPart; + break; + } + } + } + } + else { wordLines.add(new ConlluWordLine(line)); } } - return new ConlluSentence(wordLines); + return new ConlluSentence(wordLines, sentenceId, 
text); } return null; http://git-wip-us.apache.org/repos/asf/opennlp/blob/5bf5366e/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluTokenSampleStream.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluTokenSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluTokenSampleStream.java new file mode 100644 index 0000000..a9ad937 --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluTokenSampleStream.java @@ -0,0 +1,75 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package opennlp.tools.formats.conllu; + +import java.io.IOException; + +import opennlp.tools.tokenize.TokenSample; +import opennlp.tools.util.FilterObjectStream; +import opennlp.tools.util.ObjectStream; +import opennlp.tools.util.StringUtil; + +public class ConlluTokenSampleStream extends FilterObjectStream<ConlluSentence, TokenSample> { + + public ConlluTokenSampleStream(ObjectStream<ConlluSentence> samples) { + super(samples); + } + + @Override + public TokenSample read() throws IOException { + ConlluSentence sentence = samples.read(); + if (sentence != null) { + if (sentence.getTextComment() != null) { + StringBuilder text = new StringBuilder(sentence.getTextComment()); + int searchIndex = 0; + + for (ConlluWordLine wordLine : sentence.getWordLines()) { + + // skip over inserted words which are not in the source text + if (wordLine.getId().contains(".")) { + continue; + } + + String token = wordLine.getForm(); + int tokenIndex = text.indexOf(token, searchIndex); + + if (tokenIndex == -1) { + throw new IOException(String.format("Failed to match token [%s] in sentence [%s] with text [%s]", + token, sentence.getSentenceIdComment(), text)); + } + + int charAfterTokenIndex = tokenIndex + token.length(); + if (charAfterTokenIndex < text.length()) { + if (!StringUtil.isWhitespace(text.charAt(charAfterTokenIndex))) { + text.insert(charAfterTokenIndex, + TokenSample.DEFAULT_SEPARATOR_CHARS); + searchIndex += TokenSample.DEFAULT_SEPARATOR_CHARS.length(); + } + + searchIndex += token.length(); + } + } + return TokenSample.parse(text.toString(), TokenSample.DEFAULT_SEPARATOR_CHARS); + } + else { + throw new IOException("Sentence is missing raw text sample!"); + } + } + return null; + } +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/5bf5366e/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluTokenSampleStreamFactory.java ---------------------------------------------------------------------- diff --git 
a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluTokenSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluTokenSampleStreamFactory.java new file mode 100644 index 0000000..5db0407 --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluTokenSampleStreamFactory.java @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package opennlp.tools.formats.conllu; + +import java.io.IOException; + +import opennlp.tools.cmdline.ArgumentParser; +import opennlp.tools.cmdline.CmdLineUtil; +import opennlp.tools.cmdline.StreamFactoryRegistry; +import opennlp.tools.cmdline.params.BasicFormatParams; +import opennlp.tools.formats.AbstractSampleStreamFactory; +import opennlp.tools.tokenize.TokenSample; +import opennlp.tools.util.InputStreamFactory; +import opennlp.tools.util.ObjectStream; + +public class ConlluTokenSampleStreamFactory extends AbstractSampleStreamFactory<TokenSample> { + + interface Parameters extends BasicFormatParams { + } + + public static void registerFactory() { + StreamFactoryRegistry.registerFactory(TokenSample.class, + ConlluPOSSampleStreamFactory.CONLLU_FORMAT, + new ConlluTokenSampleStreamFactory(ConlluTokenSampleStreamFactory.Parameters.class)); + } + + protected <P> ConlluTokenSampleStreamFactory(Class<P> params) { + super(params); + } + + @Override + public ObjectStream<TokenSample> create(String[] args) { + Parameters params = ArgumentParser.parse(args, Parameters.class); + + InputStreamFactory inFactory = + CmdLineUtil.createInputStreamFactory(params.getData()); + + try { + return new ConlluTokenSampleStream(new ConlluStream(inFactory)); + } catch (IOException e) { + // That will throw an exception + CmdLineUtil.handleCreateObjectStreamError(e); + } + return null; + } +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/5bf5366e/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluSentenceSampleStreamTest.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluSentenceSampleStreamTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluSentenceSampleStreamTest.java new file mode 100644 index 0000000..d45d38f --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluSentenceSampleStreamTest.java @@ -0,0 +1,69 @@ 
+/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.formats.conllu; + +import java.io.IOException; + +import org.junit.Assert; +import org.junit.Test; + +import opennlp.tools.formats.ResourceAsStreamFactory; +import opennlp.tools.sentdetect.SentenceSample; +import opennlp.tools.util.InputStreamFactory; +import opennlp.tools.util.ObjectStream; +import opennlp.tools.util.Span; + +public class ConlluSentenceSampleStreamTest { + + @Test + public void testParseTwoSentences() throws IOException { + InputStreamFactory streamFactory = + new ResourceAsStreamFactory(ConlluStreamTest.class, "de-ud-train-sample.conllu"); + + try (ObjectStream<SentenceSample> stream = + new ConlluSentenceSampleStream(new ConlluStream(streamFactory), 1)) { + + SentenceSample sample1 = stream.read(); + + Assert.assertEquals("Fachlich kompetent, sehr gute Beratung und ein freundliches Team.", + sample1.getDocument()); + + Assert.assertEquals(new Span(0, 65), sample1.getSentences()[0]); + + SentenceSample sample2 = stream.read(); + + Assert.assertEquals("Beiden Zahnärzten verdanke ich einen neuen Biss und dadurch " + + "endlich keine Rückenschmerzen mehr.", sample2.getDocument()); + Assert.assertEquals(new Span(0, 95), 
sample2.getSentences()[0]); + + Assert.assertNull("Stream must be exhausted", stream.read()); + } + + try (ObjectStream<SentenceSample> stream = + new ConlluSentenceSampleStream(new ConlluStream(streamFactory), 3)) { + SentenceSample sample = stream.read(); + + Assert.assertEquals("Fachlich kompetent, sehr gute Beratung und ein freundliches Team." + + " Beiden Zahnärzten verdanke ich einen neuen Biss und dadurch endlich keine " + + "Rückenschmerzen mehr.", + sample.getDocument()); + + Assert.assertNull("Stream must be exhausted", stream.read()); + } + } +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/5bf5366e/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluStreamTest.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluStreamTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluStreamTest.java new file mode 100644 index 0000000..63968a1 --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluStreamTest.java @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package opennlp.tools.formats.conllu; + +import java.io.IOException; + +import org.junit.Assert; +import org.junit.Test; + +import opennlp.tools.formats.ResourceAsStreamFactory; +import opennlp.tools.util.InputStreamFactory; +import opennlp.tools.util.ObjectStream; + +public class ConlluStreamTest { + + @Test + public void testParseTwoSentences() throws IOException { + + InputStreamFactory streamFactory = + new ResourceAsStreamFactory(ConlluStreamTest.class, "de-ud-train-sample.conllu"); + + try (ObjectStream<ConlluSentence> stream = new ConlluStream(streamFactory)) { + ConlluSentence sent1 = stream.read(); + + Assert.assertEquals("train-s21", sent1.getSentenceIdComment()); + Assert.assertEquals("Fachlich kompetent, sehr gute Beratung und ein freundliches Team.", + sent1.getTextComment()); + Assert.assertEquals(11, sent1.getWordLines().size()); + + ConlluSentence sent2 = stream.read(); + + Assert.assertEquals("train-s22", sent2.getSentenceIdComment()); + Assert.assertEquals( + "Beiden Zahnärzten verdanke ich einen neuen Biss und dadurch endlich keine Rückenschmerzen mehr.", + sent2.getTextComment()); + Assert.assertEquals(14, sent2.getWordLines().size()); + + Assert.assertNull("Stream must be exhausted", stream.read()); + } + } +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/5bf5366e/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluTokenSampleStreamTest.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluTokenSampleStreamTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluTokenSampleStreamTest.java new file mode 100644 index 0000000..62cb9a6 --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluTokenSampleStreamTest.java @@ -0,0 +1,53 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.formats.conllu; + +import java.io.IOException; + +import org.junit.Assert; +import org.junit.Test; + +import opennlp.tools.formats.ResourceAsStreamFactory; +import opennlp.tools.tokenize.TokenSample; +import opennlp.tools.util.InputStreamFactory; +import opennlp.tools.util.ObjectStream; + +public class ConlluTokenSampleStreamTest { + + @Test + public void testParseTwoSentences() throws IOException { + InputStreamFactory streamFactory = + new ResourceAsStreamFactory(ConlluStreamTest.class, "de-ud-train-sample.conllu"); + + try (ObjectStream<TokenSample> stream = new ConlluTokenSampleStream(new ConlluStream(streamFactory))) { + + TokenSample expected1 = TokenSample.parse( + "Fachlich kompetent" + TokenSample.DEFAULT_SEPARATOR_CHARS + + ", sehr gute Beratung und ein freundliches Team" + TokenSample.DEFAULT_SEPARATOR_CHARS + + ".", TokenSample.DEFAULT_SEPARATOR_CHARS); + Assert.assertEquals(expected1, stream.read()); + + TokenSample expected2 = TokenSample.parse("Beiden Zahnärzten verdanke ich einen " + + "neuen Biss und dadurch endlich keine Rückenschmerzen mehr" + + TokenSample.DEFAULT_SEPARATOR_CHARS + ".", TokenSample.DEFAULT_SEPARATOR_CHARS); + Assert.assertEquals(expected2, stream.read()); + + Assert.assertNull("Stream must be exhausted", 
stream.read()); + } + } +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/5bf5366e/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluWordLineTest.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluWordLineTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluWordLineTest.java index 4676f6f..005ec55 100644 --- a/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluWordLineTest.java +++ b/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluWordLineTest.java @@ -27,10 +27,10 @@ public class ConlluWordLineTest { @Test public void testParseLine() throws InvalidFormatException { ConlluWordLine line = new ConlluWordLine( - "12\tHänden\tHand\tNOUN\tNN\tCase=Dat|Number=Plur\t5\tnmod\t_\t_"); + "12\tHänden\tHand\tNOUN\tNN\tCase=Dat|Number=Plur\t5\tnmod\t_\t_"); Assert.assertEquals("12", line.getId()); - Assert.assertEquals("Händen", line.getForm()); + Assert.assertEquals("Händen", line.getForm()); Assert.assertEquals("Hand", line.getLemma()); Assert.assertEquals("NOUN", line.getPosTag(ConlluTagset.U)); Assert.assertEquals("NN", line.getPosTag(ConlluTagset.X)); http://git-wip-us.apache.org/repos/asf/opennlp/blob/5bf5366e/opennlp-tools/src/test/resources/opennlp/tools/formats/conllu/de-ud-train-sample.conllu ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/resources/opennlp/tools/formats/conllu/de-ud-train-sample.conllu b/opennlp-tools/src/test/resources/opennlp/tools/formats/conllu/de-ud-train-sample.conllu new file mode 100644 index 0000000..13c19da --- /dev/null +++ b/opennlp-tools/src/test/resources/opennlp/tools/formats/conllu/de-ud-train-sample.conllu @@ -0,0 +1,30 @@ +# sent_id = train-s21 +# text = Fachlich kompetent, sehr gute Beratung und ein freundliches Team. 
+1 Fachlich fachlich ADV ADJD _ 2 advmod _ _ +2 kompetent kompetent ADJ ADJD Degree=Pos 0 root _ SpaceAfter=No +3 , , PUNCT $, _ 2 punct _ _ +4 sehr sehr ADV ADV _ 5 advmod _ _ +5 gute gut ADJ ADJA Degree=Pos 6 amod _ _ +6 Beratung Beratung NOUN NN _ 2 parataxis _ _ +7 und und CCONJ KON _ 10 cc _ _ +8 ein ein DET ART Definite=Ind|PronType=Art 10 det _ _ +9 freundliches freundlich ADJ ADJA Degree=Pos 10 amod _ _ +10 Team Team NOUN NN _ 6 conj _ SpaceAfter=No +11 . . PUNCT $. _ 2 punct _ _ + +# sent_id = train-s22 +# text = Beiden Zahnärzten verdanke ich einen neuen Biss und dadurch endlich keine Rückenschmerzen mehr. +1 Beiden beide PRON PIAT Case=Dat|Number=Plur|NumType=Card|PronType=Tot 2 det _ _ +2 Zahnärzten Zahnarzt NOUN NN Case=Dat|Number=Plur 3 iobj _ _ +3 verdanke verdanken VERB VVFIN Number=Sing|Person=1|VerbForm=Fin 0 root _ _ +4 ich ich PRON PPER Case=Nom|Number=Sing|Person=1|PronType=Prs 3 nsubj _ _ +5 einen ein DET ART Case=Acc|Definite=Ind|Number=Plur|PronType=Art 7 det _ _ +6 neuen neu ADJ ADJA Case=Acc|Degree=Pos|Number=Plur 7 amod _ _ +7 Biss Biß NOUN NN Case=Acc|Number=Plur 3 obj _ _ +8 und und CCONJ KON _ 12 cc _ _ +9 dadurch dadurch ADV PAV _ 7 advmod _ _ +10 endlich endlich ADV ADV _ 12 advmod _ _ +11 keine kein PRON PIAT PronType=Neg 12 advmod _ _ +12 Rückenschmerzen Rückenschmerz NOUN NN _ 7 conj _ _ +13 mehr mehr ADV ADV _ 12 advmod _ SpaceAfter=No +14 . . PUNCT $. _ 3 punct _ _ \ No newline at end of file
