OPENNLP-1075 Add streams for sentence and token samples for conllu

Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/5bf5366e
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/5bf5366e
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/5bf5366e

Branch: refs/heads/LangDetect
Commit: 5bf5366e2d5eca700d33d5882b65a5795cb3d656
Parents: d378c06
Author: Jörn Kottmann <[email protected]>
Authored: Tue May 23 17:28:33 2017 +0200
Committer: Jörn Kottmann <[email protected]>
Committed: Wed May 24 16:29:51 2017 +0200

----------------------------------------------------------------------
 .../tools/cmdline/StreamFactoryRegistry.java    |  4 ++
 .../conllu/ConlluLemmaSampleStreamFactory.java  |  5 +-
 .../tools/formats/conllu/ConlluSentence.java    | 15 +++-
 .../conllu/ConlluSentenceSampleStream.java      | 59 +++++++++++++++
 .../ConlluSentenceSampleStreamFactory.java      | 65 +++++++++++++++++
 .../tools/formats/conllu/ConlluStream.java      | 30 +++++++-
 .../formats/conllu/ConlluTokenSampleStream.java | 75 ++++++++++++++++++++
 .../conllu/ConlluTokenSampleStreamFactory.java  | 61 ++++++++++++++++
 .../conllu/ConlluSentenceSampleStreamTest.java  | 69 ++++++++++++++++++
 .../tools/formats/conllu/ConlluStreamTest.java  | 56 +++++++++++++++
 .../conllu/ConlluTokenSampleStreamTest.java     | 53 ++++++++++++++
 .../formats/conllu/ConlluWordLineTest.java      |  4 +-
 .../formats/conllu/de-ud-train-sample.conllu    | 30 ++++++++
 13 files changed, 517 insertions(+), 9 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/5bf5366e/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
----------------------------------------------------------------------
diff --git 
a/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java 
b/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
index 9977519..2cff212 100644
--- 
a/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
+++ 
b/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
@@ -44,6 +44,8 @@ import opennlp.tools.formats.ad.ADTokenSampleStreamFactory;
 import opennlp.tools.formats.brat.BratNameSampleStreamFactory;
 import opennlp.tools.formats.conllu.ConlluLemmaSampleStreamFactory;
 import opennlp.tools.formats.conllu.ConlluPOSSampleStreamFactory;
+import opennlp.tools.formats.conllu.ConlluSentenceSampleStreamFactory;
+import opennlp.tools.formats.conllu.ConlluTokenSampleStreamFactory;
 import opennlp.tools.formats.convert.NameToSentenceSampleStreamFactory;
 import opennlp.tools.formats.convert.NameToTokenSampleStreamFactory;
 import opennlp.tools.formats.convert.POSToSentenceSampleStreamFactory;
@@ -113,6 +115,8 @@ public final class StreamFactoryRegistry {
     LetsmtSentenceStreamFactory.registerFactory();
     MosesSentenceSampleStreamFactory.registerFactory();
 
+    ConlluTokenSampleStreamFactory.registerFactory();
+    ConlluSentenceSampleStreamFactory.registerFactory();
     ConlluPOSSampleStreamFactory.registerFactory();
     ConlluLemmaSampleStreamFactory.registerFactory();
   }

http://git-wip-us.apache.org/repos/asf/opennlp/blob/5bf5366e/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluLemmaSampleStreamFactory.java
----------------------------------------------------------------------
diff --git 
a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluLemmaSampleStreamFactory.java
 
b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluLemmaSampleStreamFactory.java
index 4806967..3204d7e 100644
--- 
a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluLemmaSampleStreamFactory.java
+++ 
b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluLemmaSampleStreamFactory.java
@@ -34,8 +34,6 @@ import opennlp.tools.util.ObjectStream;
  */
 public class ConlluLemmaSampleStreamFactory extends 
AbstractSampleStreamFactory<LemmaSample> {
 
-  public static final String CONLLU_FORMAT = "conllu";
-
   interface Parameters extends BasicFormatParams {
     @ArgumentParser.ParameterDescription(valueName = "tagset",
         description = "u|x u for unified tags and x for language-specific 
part-of-speech tags")
@@ -45,7 +43,8 @@ public class ConlluLemmaSampleStreamFactory extends 
AbstractSampleStreamFactory<
 
   public static void registerFactory() {
     StreamFactoryRegistry.registerFactory(LemmaSample.class,
-        CONLLU_FORMAT, new ConlluLemmaSampleStreamFactory(Parameters.class));
+        ConlluPOSSampleStreamFactory.CONLLU_FORMAT,
+        new ConlluLemmaSampleStreamFactory(Parameters.class));
   }
 
   protected <P> ConlluLemmaSampleStreamFactory(Class<P> params) {

http://git-wip-us.apache.org/repos/asf/opennlp/blob/5bf5366e/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentence.java
----------------------------------------------------------------------
diff --git 
a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentence.java 
b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentence.java
index 5d92d89..bbd2b96 100644
--- 
a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentence.java
+++ 
b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentence.java
@@ -23,11 +23,24 @@ public class ConlluSentence {
 
   private List<ConlluWordLine> wordLines;
 
-  ConlluSentence(List<ConlluWordLine> wordLines) {
+  private String sentenceIdComment;
+  private String textComment;
+
+  ConlluSentence(List<ConlluWordLine> wordLines, String sentenceIdComment, 
String textComment) {
     this.wordLines = wordLines;
+    this.sentenceIdComment = sentenceIdComment;
+    this.textComment = textComment;
   }
 
   public List<ConlluWordLine> getWordLines() {
     return wordLines;
   }
+
+  public String getSentenceIdComment() {
+    return sentenceIdComment;
+  }
+
+  public String getTextComment() {
+    return textComment;
+  }
 }

http://git-wip-us.apache.org/repos/asf/opennlp/blob/5bf5366e/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentenceSampleStream.java
----------------------------------------------------------------------
diff --git 
a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentenceSampleStream.java
 
b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentenceSampleStream.java
new file mode 100644
index 0000000..f49e205
--- /dev/null
+++ 
b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentenceSampleStream.java
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.conllu;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import opennlp.tools.sentdetect.SentenceSample;
+import opennlp.tools.util.FilterObjectStream;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.Span;
+
+public class ConlluSentenceSampleStream extends 
FilterObjectStream<ConlluSentence, SentenceSample> {
+
+  private final int sentencesPerSample;
+
+  public ConlluSentenceSampleStream(ObjectStream<ConlluSentence> samples, int 
sentencesPerSample) {
+    super(samples);
+    this.sentencesPerSample = sentencesPerSample;
+  }
+
+  @Override
+  public SentenceSample read() throws IOException {
+    StringBuilder documentText = new StringBuilder();
+
+    List<Span> sentenceSpans = new ArrayList<>();
+
+    ConlluSentence sentence;
+    for (int i = 0; i <  sentencesPerSample && (sentence = samples.read()) != 
null; i++) {
+
+      int startIndex = documentText.length();
+      documentText.append(sentence.getTextComment()).append(' ');
+      sentenceSpans.add(new Span(startIndex, documentText.length() - 1));
+    }
+
+    if (documentText.length() > 0) {
+      documentText.setLength(documentText.length() - 1);
+      return new SentenceSample(documentText, sentenceSpans.toArray(new 
Span[sentenceSpans.size()]));
+    }
+
+    return null;
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/5bf5366e/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentenceSampleStreamFactory.java
----------------------------------------------------------------------
diff --git 
a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentenceSampleStreamFactory.java
 
b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentenceSampleStreamFactory.java
new file mode 100644
index 0000000..000af27
--- /dev/null
+++ 
b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentenceSampleStreamFactory.java
@@ -0,0 +1,65 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.conllu;
+
+import java.io.IOException;
+
+import opennlp.tools.cmdline.ArgumentParser;
+import opennlp.tools.cmdline.CmdLineUtil;
+import opennlp.tools.cmdline.StreamFactoryRegistry;
+import opennlp.tools.cmdline.params.BasicFormatParams;
+import opennlp.tools.formats.AbstractSampleStreamFactory;
+import opennlp.tools.sentdetect.SentenceSample;
+import opennlp.tools.util.InputStreamFactory;
+import opennlp.tools.util.ObjectStream;
+
+public class ConlluSentenceSampleStreamFactory extends 
AbstractSampleStreamFactory<SentenceSample> {
+
+  interface Parameters extends BasicFormatParams {
+    @ArgumentParser.ParameterDescription(valueName = "sentencesPerSample",
+        description = "number of sentences per sample")
+    String getSentencesPerSample();
+  }
+
+  public static void registerFactory() {
+    StreamFactoryRegistry.registerFactory(SentenceSample.class,
+        ConlluPOSSampleStreamFactory.CONLLU_FORMAT,
+        new 
ConlluSentenceSampleStreamFactory(ConlluSentenceSampleStreamFactory.Parameters.class));
+  }
+
+  protected <P> ConlluSentenceSampleStreamFactory(Class<P> params) {
+    super(params);
+  }
+
+  @Override
+  public ObjectStream<SentenceSample> create(String[] args) {
+    Parameters params = ArgumentParser.parse(args, Parameters.class);
+
+    InputStreamFactory inFactory =
+        CmdLineUtil.createInputStreamFactory(params.getData());
+
+    try {
+      return new ConlluSentenceSampleStream(new ConlluStream(inFactory),
+          Integer.parseInt(params.getSentencesPerSample()));
+    } catch (IOException e) {
+      // That will throw an exception
+      CmdLineUtil.handleCreateObjectStreamError(e);
+    }
+    return null;
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/5bf5366e/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluStream.java
----------------------------------------------------------------------
diff --git 
a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluStream.java 
b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluStream.java
index 873a9ed..cbac450 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluStream.java
@@ -49,15 +49,39 @@ public class ConlluStream implements 
ObjectStream<ConlluSentence> {
 
       BufferedReader reader = new BufferedReader(new StringReader(sentence));
 
+      String sentenceId = null;
+      String text = null;
+
       String line;
       while ((line = reader.readLine())  != null) {
-        // # indicates a comment line and should be skipped
-        if (!line.trim().startsWith("#")) {
+        // # indicates a comment line and contains additional data
+        if (line.trim().startsWith("#")) {
+          String commentLine = line.trim().substring(1);
+
+          int separator = commentLine.indexOf('=');
+
+          if (separator != -1) {
+            String firstPart = commentLine.substring(0, separator).trim();
+            String secondPart = commentLine.substring(separator + 1, 
commentLine.length()).trim();
+
+            if (!secondPart.isEmpty()) {
+              switch (firstPart) {
+                case "sent_id":
+                  sentenceId = secondPart;
+                  break;
+                case "text":
+                  text = secondPart;
+                  break;
+              }
+            }
+          }
+        }
+        else {
           wordLines.add(new ConlluWordLine(line));
         }
       }
 
-      return new ConlluSentence(wordLines);
+      return new ConlluSentence(wordLines, sentenceId, text);
     }
 
     return null;

http://git-wip-us.apache.org/repos/asf/opennlp/blob/5bf5366e/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluTokenSampleStream.java
----------------------------------------------------------------------
diff --git 
a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluTokenSampleStream.java
 
b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluTokenSampleStream.java
new file mode 100644
index 0000000..a9ad937
--- /dev/null
+++ 
b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluTokenSampleStream.java
@@ -0,0 +1,75 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.conllu;
+
+import java.io.IOException;
+
+import opennlp.tools.tokenize.TokenSample;
+import opennlp.tools.util.FilterObjectStream;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.StringUtil;
+
+public class ConlluTokenSampleStream extends 
FilterObjectStream<ConlluSentence, TokenSample> {
+
+  public ConlluTokenSampleStream(ObjectStream<ConlluSentence> samples) {
+    super(samples);
+  }
+
+  @Override
+  public TokenSample read() throws IOException {
+    ConlluSentence sentence = samples.read();
+    if (sentence != null) {
+      if (sentence.getTextComment() != null) {
+        StringBuilder text = new StringBuilder(sentence.getTextComment());
+        int searchIndex = 0;
+
+        for (ConlluWordLine wordLine : sentence.getWordLines()) {
+
+          // skip over inserted words which are not in the source text
+          if (wordLine.getId().contains(".")) {
+            continue;
+          }
+
+          String token = wordLine.getForm();
+          int tokenIndex = text.indexOf(token, searchIndex);
+
+          if (tokenIndex == -1) {
+            throw new IOException(String.format("Failed to match token [%s] in 
sentence [%s] with text [%s]",
+                token, sentence.getSentenceIdComment(), text));
+          }
+
+          int charAfterTokenIndex = tokenIndex + token.length();
+          if (charAfterTokenIndex < text.length()) {
+            if (!StringUtil.isWhitespace(text.charAt(charAfterTokenIndex))) {
+              text.insert(charAfterTokenIndex,
+                  TokenSample.DEFAULT_SEPARATOR_CHARS);
+              searchIndex += TokenSample.DEFAULT_SEPARATOR_CHARS.length();
+            }
+
+            searchIndex += token.length();
+          }
+        }
+        return TokenSample.parse(text.toString(), 
TokenSample.DEFAULT_SEPARATOR_CHARS);
+      }
+      else {
+        throw new IOException("Sentence is missing raw text sample!");
+      }
+    }
+    return null;
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/5bf5366e/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluTokenSampleStreamFactory.java
----------------------------------------------------------------------
diff --git 
a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluTokenSampleStreamFactory.java
 
b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluTokenSampleStreamFactory.java
new file mode 100644
index 0000000..5db0407
--- /dev/null
+++ 
b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluTokenSampleStreamFactory.java
@@ -0,0 +1,61 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.conllu;
+
+import java.io.IOException;
+
+import opennlp.tools.cmdline.ArgumentParser;
+import opennlp.tools.cmdline.CmdLineUtil;
+import opennlp.tools.cmdline.StreamFactoryRegistry;
+import opennlp.tools.cmdline.params.BasicFormatParams;
+import opennlp.tools.formats.AbstractSampleStreamFactory;
+import opennlp.tools.tokenize.TokenSample;
+import opennlp.tools.util.InputStreamFactory;
+import opennlp.tools.util.ObjectStream;
+
+public class ConlluTokenSampleStreamFactory extends 
AbstractSampleStreamFactory<TokenSample> {
+
+  interface Parameters extends BasicFormatParams {
+  }
+
+  public static void registerFactory() {
+    StreamFactoryRegistry.registerFactory(TokenSample.class,
+        ConlluPOSSampleStreamFactory.CONLLU_FORMAT,
+        new 
ConlluTokenSampleStreamFactory(ConlluTokenSampleStreamFactory.Parameters.class));
+  }
+
+  protected <P> ConlluTokenSampleStreamFactory(Class<P> params) {
+    super(params);
+  }
+
+  @Override
+  public ObjectStream<TokenSample> create(String[] args) {
+    Parameters params = ArgumentParser.parse(args, Parameters.class);
+
+    InputStreamFactory inFactory =
+        CmdLineUtil.createInputStreamFactory(params.getData());
+
+    try {
+      return new ConlluTokenSampleStream(new ConlluStream(inFactory));
+    } catch (IOException e) {
+      // That will throw an exception
+      CmdLineUtil.handleCreateObjectStreamError(e);
+    }
+    return null;
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/5bf5366e/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluSentenceSampleStreamTest.java
----------------------------------------------------------------------
diff --git 
a/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluSentenceSampleStreamTest.java
 
b/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluSentenceSampleStreamTest.java
new file mode 100644
index 0000000..d45d38f
--- /dev/null
+++ 
b/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluSentenceSampleStreamTest.java
@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.conllu;
+
+import java.io.IOException;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import opennlp.tools.formats.ResourceAsStreamFactory;
+import opennlp.tools.sentdetect.SentenceSample;
+import opennlp.tools.util.InputStreamFactory;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.Span;
+
+public class ConlluSentenceSampleStreamTest {
+
+  @Test
+  public void testParseTwoSentences() throws IOException {
+    InputStreamFactory streamFactory =
+        new ResourceAsStreamFactory(ConlluStreamTest.class, 
"de-ud-train-sample.conllu");
+
+    try (ObjectStream<SentenceSample> stream =
+             new ConlluSentenceSampleStream(new ConlluStream(streamFactory), 
1)) {
+
+      SentenceSample sample1 = stream.read();
+
+      Assert.assertEquals("Fachlich kompetent, sehr gute Beratung und ein 
freundliches Team.",
+          sample1.getDocument());
+
+      Assert.assertEquals(new Span(0, 65), sample1.getSentences()[0]);
+
+      SentenceSample sample2 = stream.read();
+
+      Assert.assertEquals("Beiden Zahnärzten verdanke ich einen neuen Biss 
und dadurch " +
+          "endlich keine Rückenschmerzen mehr.", sample2.getDocument());
+      Assert.assertEquals(new Span(0, 95), sample2.getSentences()[0]);
+
+      Assert.assertNull("Stream must be exhausted", stream.read());
+    }
+
+    try (ObjectStream<SentenceSample> stream =
+             new ConlluSentenceSampleStream(new ConlluStream(streamFactory), 
3)) {
+      SentenceSample sample = stream.read();
+
+      Assert.assertEquals("Fachlich kompetent, sehr gute Beratung und ein 
freundliches Team."
+           + " Beiden Zahnärzten verdanke ich einen neuen Biss und dadurch 
endlich keine "
+           + "Rückenschmerzen mehr.",
+          sample.getDocument());
+
+      Assert.assertNull("Stream must be exhausted", stream.read());
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/5bf5366e/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluStreamTest.java
----------------------------------------------------------------------
diff --git 
a/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluStreamTest.java
 
b/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluStreamTest.java
new file mode 100644
index 0000000..63968a1
--- /dev/null
+++ 
b/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluStreamTest.java
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.conllu;
+
+import java.io.IOException;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import opennlp.tools.formats.ResourceAsStreamFactory;
+import opennlp.tools.util.InputStreamFactory;
+import opennlp.tools.util.ObjectStream;
+
+public class ConlluStreamTest {
+
+  @Test
+  public void testParseTwoSentences() throws IOException {
+
+    InputStreamFactory streamFactory =
+        new ResourceAsStreamFactory(ConlluStreamTest.class, 
"de-ud-train-sample.conllu");
+
+    try (ObjectStream<ConlluSentence> stream = new 
ConlluStream(streamFactory)) {
+      ConlluSentence sent1 = stream.read();
+
+      Assert.assertEquals("train-s21", sent1.getSentenceIdComment());
+      Assert.assertEquals("Fachlich kompetent, sehr gute Beratung und ein 
freundliches Team.",
+          sent1.getTextComment());
+      Assert.assertEquals(11, sent1.getWordLines().size());
+
+      ConlluSentence sent2 = stream.read();
+
+      Assert.assertEquals("train-s22", sent2.getSentenceIdComment());
+      Assert.assertEquals(
+          "Beiden Zahnärzten verdanke ich einen neuen Biss und dadurch 
endlich keine Rückenschmerzen mehr.",
+          sent2.getTextComment());
+      Assert.assertEquals(14, sent2.getWordLines().size());
+
+      Assert.assertNull("Stream must be exhausted", stream.read());
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/5bf5366e/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluTokenSampleStreamTest.java
----------------------------------------------------------------------
diff --git 
a/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluTokenSampleStreamTest.java
 
b/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluTokenSampleStreamTest.java
new file mode 100644
index 0000000..62cb9a6
--- /dev/null
+++ 
b/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluTokenSampleStreamTest.java
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.conllu;
+
+import java.io.IOException;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import opennlp.tools.formats.ResourceAsStreamFactory;
+import opennlp.tools.tokenize.TokenSample;
+import opennlp.tools.util.InputStreamFactory;
+import opennlp.tools.util.ObjectStream;
+
+public class ConlluTokenSampleStreamTest {
+
+  @Test
+  public void testParseTwoSentences() throws IOException {
+    InputStreamFactory streamFactory =
+        new ResourceAsStreamFactory(ConlluStreamTest.class, 
"de-ud-train-sample.conllu");
+
+    try (ObjectStream<TokenSample> stream = new ConlluTokenSampleStream(new 
ConlluStream(streamFactory))) {
+
+      TokenSample expected1 = TokenSample.parse(
+          "Fachlich kompetent" + TokenSample.DEFAULT_SEPARATOR_CHARS
+          + ", sehr gute Beratung und ein freundliches Team" + 
TokenSample.DEFAULT_SEPARATOR_CHARS
+          + ".", TokenSample.DEFAULT_SEPARATOR_CHARS);
+      Assert.assertEquals(expected1, stream.read());
+
+      TokenSample expected2 = TokenSample.parse("Beiden Zahnärzten verdanke 
ich einen " +
+          "neuen Biss und dadurch endlich keine Rückenschmerzen mehr"
+          + TokenSample.DEFAULT_SEPARATOR_CHARS + ".", 
TokenSample.DEFAULT_SEPARATOR_CHARS);
+      Assert.assertEquals(expected2, stream.read());
+
+      Assert.assertNull("Stream must be exhausted", stream.read());
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/5bf5366e/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluWordLineTest.java
----------------------------------------------------------------------
diff --git 
a/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluWordLineTest.java
 
b/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluWordLineTest.java
index 4676f6f..005ec55 100644
--- 
a/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluWordLineTest.java
+++ 
b/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluWordLineTest.java
@@ -27,10 +27,10 @@ public class ConlluWordLineTest {
   @Test
   public void testParseLine() throws InvalidFormatException {
     ConlluWordLine line = new ConlluWordLine(
-        "12\tHänden\tHand\tNOUN\tNN\tCase=Dat|Number=Plur\t5\tnmod\t_\t_");
+        "12\tHänden\tHand\tNOUN\tNN\tCase=Dat|Number=Plur\t5\tnmod\t_\t_");
 
     Assert.assertEquals("12", line.getId());
-    Assert.assertEquals("Händen", line.getForm());
+    Assert.assertEquals("Händen", line.getForm());
     Assert.assertEquals("Hand", line.getLemma());
     Assert.assertEquals("NOUN", line.getPosTag(ConlluTagset.U));
     Assert.assertEquals("NN", line.getPosTag(ConlluTagset.X));

http://git-wip-us.apache.org/repos/asf/opennlp/blob/5bf5366e/opennlp-tools/src/test/resources/opennlp/tools/formats/conllu/de-ud-train-sample.conllu
----------------------------------------------------------------------
diff --git 
a/opennlp-tools/src/test/resources/opennlp/tools/formats/conllu/de-ud-train-sample.conllu
 
b/opennlp-tools/src/test/resources/opennlp/tools/formats/conllu/de-ud-train-sample.conllu
new file mode 100644
index 0000000..13c19da
--- /dev/null
+++ 
b/opennlp-tools/src/test/resources/opennlp/tools/formats/conllu/de-ud-train-sample.conllu
@@ -0,0 +1,30 @@
+# sent_id = train-s21
+# text = Fachlich kompetent, sehr gute Beratung und ein freundliches Team.
+1      Fachlich        fachlich        ADV     ADJD    _       2       advmod  
_       _
+2      kompetent       kompetent       ADJ     ADJD    Degree=Pos      0       
root    _       SpaceAfter=No
+3      ,       ,       PUNCT   $,      _       2       punct   _       _
+4      sehr    sehr    ADV     ADV     _       5       advmod  _       _
+5      gute    gut     ADJ     ADJA    Degree=Pos      6       amod    _       
_
+6      Beratung        Beratung        NOUN    NN      _       2       
parataxis       _       _
+7      und     und     CCONJ   KON     _       10      cc      _       _
+8      ein     ein     DET     ART     Definite=Ind|PronType=Art       10      
det     _       _
+9      freundliches    freundlich      ADJ     ADJA    Degree=Pos      10      
amod    _       _
+10     Team    Team    NOUN    NN      _       6       conj    _       
SpaceAfter=No
+11     .       .       PUNCT   $.      _       2       punct   _       _
+
+# sent_id = train-s22
+# text = Beiden Zahnärzten verdanke ich einen neuen Biss und dadurch endlich 
keine Rückenschmerzen mehr.
+1      Beiden  beide   PRON    PIAT    
Case=Dat|Number=Plur|NumType=Card|PronType=Tot  2       det     _       _
+2      Zahnärzten     Zahnarzt        NOUN    NN      Case=Dat|Number=Plur    
3       iobj    _       _
+3      verdanke        verdanken       VERB    VVFIN   
Number=Sing|Person=1|VerbForm=Fin       0       root    _       _
+4      ich     ich     PRON    PPER    
Case=Nom|Number=Sing|Person=1|PronType=Prs      3       nsubj   _       _
+5      einen   ein     DET     ART     
Case=Acc|Definite=Ind|Number=Plur|PronType=Art  7       det     _       _
+6      neuen   neu     ADJ     ADJA    Case=Acc|Degree=Pos|Number=Plur 7       
amod    _       _
+7      Biss    Biß    NOUN    NN      Case=Acc|Number=Plur    3       obj     
_       _
+8      und     und     CCONJ   KON     _       12      cc      _       _
+9      dadurch dadurch ADV     PAV     _       7       advmod  _       _
+10     endlich endlich ADV     ADV     _       12      advmod  _       _
+11     keine   kein    PRON    PIAT    PronType=Neg    12      advmod  _       
_
+12     Rückenschmerzen        Rückenschmerz  NOUN    NN      _       7       
conj    _       _
+13     mehr    mehr    ADV     ADV     _       12      advmod  _       
SpaceAfter=No
+14     .       .       PUNCT   $.      _       3       punct   _       _
\ No newline at end of file

Reply via email to