Repository: opennlp
Updated Branches:
  refs/heads/master f418eed30 -> cc173c2e4


OPENNLP-1083: Conll-U Sample contraction handling

closes apache/opennlp#222


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/cc173c2e
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/cc173c2e
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/cc173c2e

Branch: refs/heads/master
Commit: cc173c2e4d47d6ee49b4b6050a0fea779d691429
Parents: f418eed
Author: William D C M SILVA <[email protected]>
Authored: Tue May 30 12:56:20 2017 -0300
Committer: William D C M SILVA <[email protected]>
Committed: Tue May 30 12:56:20 2017 -0300

----------------------------------------------------------------------
 .../tools/formats/conllu/ConlluStream.java      | 86 ++++++++++++++++++++
 .../formats/conllu/ConlluTokenSampleStream.java | 11 +--
 .../tools/formats/conllu/ConlluWordLine.java    | 14 ++++
 .../conllu/ConlluLemmaSampleStreamTest.java     | 49 +++++++++++
 .../conllu/ConlluPOSSampleStreamTest.java       | 77 ++++++++++++++++++
 .../conllu/ConlluTokenSampleStreamTest.java     | 51 +++++++++++-
 .../tools/formats/conllu/es-ud-sample.conllu    | 62 ++++++++++++++
 .../tools/formats/conllu/pt_br-ud-sample.conllu | 76 +++++++++++++++++
 8 files changed, 417 insertions(+), 9 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/cc173c2e/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluStream.java
----------------------------------------------------------------------
diff --git 
a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluStream.java 
b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluStream.java
index cbac450..4dd204f 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluStream.java
@@ -22,7 +22,10 @@ import java.io.IOException;
 import java.io.StringReader;
 import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
+import java.util.HashMap;
 import java.util.List;
+import java.util.Map;
+import java.util.stream.Collectors;
 
 import opennlp.tools.util.InputStreamFactory;
 import opennlp.tools.util.ObjectStream;
@@ -81,12 +84,95 @@ public class ConlluStream implements 
ObjectStream<ConlluSentence> {
         }
       }
 
+      wordLines = postProcessContractions(wordLines);
+
       return new ConlluSentence(wordLines, sentenceId, text);
     }
 
     return null;
   }
 
+  /**
+   * Collapses the contractions (lines whose id is a range such as {@code 1-3})
+   * of a sentence into single lines.
+   * <p>
+   * Every range line is replaced by the line created by
+   * {@link #mergeAnnotation(ConlluWordLine, List)} and the lines of the ids covered
+   * by the range are removed. Every id inside a range is expected to have its
+   * own line in the list.
+   *
+   * @param lines the word lines of one sentence, the list is modified in place
+   *              and must therefore be mutable
+   * @return the passed in list after the contractions were merged
+   */
+  private List<ConlluWordLine> postProcessContractions(List<ConlluWordLine> lines) {
+
+    // 1. Find contractions
+    // maps a line id to the position of that line in the list
+    Map<String, Integer> index = new HashMap<>();
+    // maps the id of a contraction (e.g. "1-3") to the ids it covers ("1", "2", "3")
+    Map<String, List<String>> contractions = new HashMap<>();
+    List<String> linesToDelete = new ArrayList<>();
+
+    for (int i = 0; i < lines.size(); i++) {
+      ConlluWordLine line = lines.get(i);
+      index.put(line.getId(), i);
+      if (line.getId().contains("-")) {
+        List<String> expandedContractions = new ArrayList<>();
+        String[] ids = line.getId().split("-");
+        int start = Integer.parseInt(ids[0]);
+        int end = Integer.parseInt(ids[1]);
+        for (int j = start; j <= end; j++) {
+          String js = Integer.toString(j);
+          expandedContractions.add(js);
+          linesToDelete.add(js);
+        }
+        contractions.put(line.getId(), expandedContractions);
+      }
+    }
+
+    // 2. Merge annotation
+    // set() does not shift any element, so the positions in the index stay valid
+    for (Map.Entry<String, List<String>> entry : contractions.entrySet()) {
+      int contractionIndex = index.get(entry.getKey());
+      ConlluWordLine contraction = lines.get(contractionIndex);
+      List<ConlluWordLine> expandedParts = new ArrayList<>();
+      for (String id : entry.getValue()) {
+        expandedParts.add(lines.get(index.get(id)));
+      }
+      lines.set(contractionIndex, mergeAnnotation(contraction, expandedParts));
+    }
+
+    // 3. Delete the expanded parts
+    // The ids were collected in the order the lines were scanned, walking them
+    // backwards removes the later lines first so the positions of the lines that
+    // still have to be removed do not shift (assumes the parts follow in id order).
+    for (int i = linesToDelete.size() - 1; i >= 0; i--) {
+      // intValue() selects remove(int index) instead of remove(Object)
+      lines.remove(index.get(linesToDelete.get(i)).intValue());
+    }
+    return lines;
+  }
+
+  /**
+   * Builds the line that represents a contraction from the contraction line
+   * and the lines of its expanded parts.
+   * <p>
+   * Id, form, head, deprel, deps and misc are taken from the contraction line.
+   * Lemma, both POS tags and the features are the values of the expanded parts
+   * joined with {@code +}, values equal to {@code _} are left out.
+   *
+   * @param contraction the line that receives the annotation
+   * @param expandedParts the lines to get annotation
+   * @return the merged line
+   */
+  private ConlluWordLine mergeAnnotation(ConlluWordLine contraction,
+                                         List<ConlluWordLine> expandedParts) {
+    final String unset = "_";
+    final String delimiter = "+";
+
+    String mergedLemma = expandedParts.stream()
+        .map(ConlluWordLine::getLemma)
+        .filter(value -> !unset.equals(value))
+        .collect(Collectors.joining(delimiter));
+
+    String mergedUPosTag = expandedParts.stream()
+        .map(part -> part.getPosTag(ConlluTagset.U))
+        .filter(value -> !unset.equals(value))
+        .collect(Collectors.joining(delimiter));
+
+    String mergedXPosTag = expandedParts.stream()
+        .map(part -> part.getPosTag(ConlluTagset.X))
+        .filter(value -> !unset.equals(value))
+        .collect(Collectors.joining(delimiter));
+
+    String mergedFeats = expandedParts.stream()
+        .map(ConlluWordLine::getFeats)
+        .filter(value -> !unset.equals(value))
+        .collect(Collectors.joining(delimiter));
+
+    // everything that is not merged is copied from the contraction line itself
+    return new ConlluWordLine(contraction.getId(), contraction.getForm(), mergedLemma,
+        mergedUPosTag, mergedXPosTag, mergedFeats, contraction.getHead(),
+        contraction.getDeprel(), contraction.getDeps(), contraction.getMisc());
+  }
+
   @Override
   public void close() throws IOException {
     sentenceStream.close();

http://git-wip-us.apache.org/repos/asf/opennlp/blob/cc173c2e/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluTokenSampleStream.java
----------------------------------------------------------------------
diff --git 
a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluTokenSampleStream.java
 
b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluTokenSampleStream.java
index a9ad937..bc6907b 100644
--- 
a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluTokenSampleStream.java
+++ 
b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluTokenSampleStream.java
@@ -53,15 +53,12 @@ public class ConlluTokenSampleStream extends 
FilterObjectStream<ConlluSentence,
                 token, sentence.getSentenceIdComment(), text));
           }
 
-          int charAfterTokenIndex = tokenIndex + token.length();
-          if (charAfterTokenIndex < text.length()) {
-            if (!StringUtil.isWhitespace(text.charAt(charAfterTokenIndex))) {
-              text.insert(charAfterTokenIndex,
+          searchIndex = tokenIndex + token.length();
+          if (searchIndex < text.length()) {
+            if (!StringUtil.isWhitespace(text.charAt(searchIndex))) {
+              text.insert(searchIndex,
                   TokenSample.DEFAULT_SEPARATOR_CHARS);
-              searchIndex += TokenSample.DEFAULT_SEPARATOR_CHARS.length();
             }
-
-            searchIndex += token.length();
           }
         }
         return TokenSample.parse(text.toString(), 
TokenSample.DEFAULT_SEPARATOR_CHARS);

http://git-wip-us.apache.org/repos/asf/opennlp/blob/cc173c2e/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluWordLine.java
----------------------------------------------------------------------
diff --git 
a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluWordLine.java 
b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluWordLine.java
index 9881bf1..4e626be 100644
--- 
a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluWordLine.java
+++ 
b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluWordLine.java
@@ -32,6 +32,20 @@ public class ConlluWordLine {
   private final String deps;
   private final String misc;
 
+  /**
+   * Initializes a word line from field values that are already split.
+   * Each argument is stored unchanged in the field of the same name,
+   * no validation is done.
+   *
+   * @param id the value for the id field
+   * @param form the value for the form field
+   * @param lemma the value for the lemma field
+   * @param uPosTag the value for the uPosTag field
+   * @param xPosTag the value for the xPosTag field
+   * @param feats the value for the feats field
+   * @param head the value for the head field
+   * @param deprel the value for the deprel field
+   * @param deps the value for the deps field
+   * @param misc the value for the misc field
+   */
+  ConlluWordLine(String id, String form, String lemma, String uPosTag, String 
xPosTag,
+                 String feats, String head, String deprel, String deps, String 
misc) {
+    this.id = id;
+    this.form = form;
+    this.lemma = lemma;
+    this.uPosTag = uPosTag;
+    this.xPosTag = xPosTag;
+    this.feats = feats;
+    this.head = head;
+    this.deprel = deprel;
+    this.deps = deps;
+    this.misc = misc;
+  }
+
   ConlluWordLine(String line) throws InvalidFormatException {
 
     String[] fields = line.split("\t");

http://git-wip-us.apache.org/repos/asf/opennlp/blob/cc173c2e/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluLemmaSampleStreamTest.java
----------------------------------------------------------------------
diff --git 
a/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluLemmaSampleStreamTest.java
 
b/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluLemmaSampleStreamTest.java
new file mode 100644
index 0000000..5d58cf1
--- /dev/null
+++ 
b/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluLemmaSampleStreamTest.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.conllu;
+
+
+import java.io.IOException;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import opennlp.tools.formats.ResourceAsStreamFactory;
+import opennlp.tools.lemmatizer.LemmaSample;
+import opennlp.tools.util.InputStreamFactory;
+import opennlp.tools.util.ObjectStream;
+
+/**
+ * Tests the {@link ConlluLemmaSampleStream} with a sentence that contains contractions.
+ */
+public class ConlluLemmaSampleStreamTest {
+
+  /**
+   * Reads the first sentence of es-ud-sample.conllu and checks the lemma of the
+   * contraction at the start of the sentence and the token and lemma at index 3.
+   *
+   * @throws IOException if reading the sample fails
+   */
+  @Test
+  public void testParseSpanishS300() throws IOException {
+    InputStreamFactory streamFactory =
+        new ResourceAsStreamFactory(ConlluStreamTest.class, "es-ud-sample.conllu");
+
+    try (ObjectStream<LemmaSample> stream = new ConlluLemmaSampleStream(
+        new ConlluStream(streamFactory), ConlluTagset.U)) {
+
+      LemmaSample predicted = stream.read();
+      // the lemmas of the three parts of the first contraction are joined with '+'
+      Assert.assertEquals("digám+tú+él", predicted.getLemmas()[0]);
+      Assert.assertEquals("la", predicted.getTokens()[3]);
+      Assert.assertEquals("el", predicted.getLemmas()[3]);
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/cc173c2e/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluPOSSampleStreamTest.java
----------------------------------------------------------------------
diff --git 
a/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluPOSSampleStreamTest.java
 
b/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluPOSSampleStreamTest.java
new file mode 100644
index 0000000..f6bef72
--- /dev/null
+++ 
b/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluPOSSampleStreamTest.java
@@ -0,0 +1,77 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.conllu;
+
+
+import java.io.IOException;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import opennlp.tools.formats.ResourceAsStreamFactory;
+import opennlp.tools.postag.POSSample;
+import opennlp.tools.util.InputStreamFactory;
+import opennlp.tools.util.ObjectStream;
+
+/**
+ * Tests the {@link ConlluPOSSampleStream} with sentences that contain contractions.
+ */
+public class ConlluPOSSampleStreamTest {
+  /**
+   * Reads the first sentence of pt_br-ud-sample.conllu with the {@code ConlluTagset.U}
+   * tagset and compares it with the expected sample. In the expected sample every
+   * contraction is one token and its tag is made of the tags of its parts joined
+   * with '+', e.g. {@code Numa_ADP+DET}.
+   */
+  @Test
+  public void testParseContraction() throws IOException {
+    InputStreamFactory streamFactory =
+        new ResourceAsStreamFactory(ConlluStreamTest.class, 
"pt_br-ud-sample.conllu");
+
+    try (ObjectStream<POSSample> stream = new ConlluPOSSampleStream(
+        new ConlluStream(streamFactory), ConlluTagset.U)) {
+
+      POSSample expected = POSSample.parse("Numa_ADP+DET reunião_NOUN 
entre_ADP " +
+          "representantes_NOUN da_ADP+DET Secretaria_PROPN da_ADP+DET 
Criança_PROPN do_ADP+DET " +
+          "DF_PROPN ea_CCONJ juíza_NOUN da_ADP+DET Vara_PROPN de_ADP 
Execuções_PROPN de_ADP " +
+          "Medidas_PROPN Socioeducativas_PROPN ,_PUNCT Lavínia_PROPN 
Tupi_PROPN Vieira_PROPN " +
+          "Fonseca_PROPN ,_PUNCT ficou_VERB acordado_ADJ que_CCONJ dos_ADP+DET 
25_NUM " +
+          "internos_NOUN ,_PUNCT 12_NUM serão_AUX internados_VERB na_ADP+DET 
Unidade_PROPN " +
+          "de_ADP Planaltina_PROPN e_CCONJ os_DET outros_DET 13_NUM devem_AUX 
retornar_VERB " +
+          "para_ADP a_DET Unidade_PROPN do_ADP+DET Recanto_NOUN das_ADP+DET 
Emas_PROPN ,_PUNCT " +
+          "antigo_ADJ Ciago_PROPN ._PUNCT");
+
+      POSSample predicted = stream.read();
+      Assert.assertEquals(expected, predicted);
+    }
+  }
+
+
+  /**
+   * Reads the first sentence of es-ud-sample.conllu with the {@code ConlluTagset.U}
+   * tagset and compares it with the expected sample. The sentence starts with a
+   * contraction of three parts, {@code Digámoslo_VERB+PRON+PRON}.
+   */
+  @Test
+  public void testParseSpanishS300() throws IOException {
+    InputStreamFactory streamFactory =
+        new ResourceAsStreamFactory(ConlluStreamTest.class, 
"es-ud-sample.conllu");
+
+    try (ObjectStream<POSSample> stream = new ConlluPOSSampleStream(new 
ConlluStream(streamFactory),
+        ConlluTagset.U)) {
+
+      POSSample expected1 = POSSample.parse(
+          "Digámoslo_VERB+PRON+PRON claramente_ADV ,_PUNCT la_DET 
insurgencia_NOUN se_PRON " +
+              "ha_AUX pronunciado_VERB mucho_PRON más_ADV claramente_ADV 
respecto_NOUN " +
+              "al_ADP+DET tema_NOUN de_ADP la_DET paz_NOUN que_CCONJ el_DET 
Estado_NOUN ,_PUNCT " +
+              "como_SCONJ lo_PRON demuestra_VERB el_DET fragmento_NOUN 
que_SCONJ Bermúdez_PROPN " +
+              "cita_VERB de_ADP la_DET respuesta_NOUN de_ADP \"_PUNCT 
Gabino_PROPN \"_PUNCT " +
+              "a_ADP Piedad_PROPN Córdoba_PROPN ,_PUNCT en_ADP la_DET 
cual_PRON no_ADV se_PRON " +
+              "plantea_VERB ni_CCONJ siquiera_ADV \"_PUNCT esperar_VERB un_DET 
mejor_ADJ " +
+              "gobierno_NOUN \"_PUNCT ._PUNCT");
+      POSSample predicted = stream.read();
+      Assert.assertEquals(expected1, predicted);
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/cc173c2e/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluTokenSampleStreamTest.java
----------------------------------------------------------------------
diff --git 
a/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluTokenSampleStreamTest.java
 
b/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluTokenSampleStreamTest.java
index 62cb9a6..be32a3b 100644
--- 
a/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluTokenSampleStreamTest.java
+++ 
b/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluTokenSampleStreamTest.java
@@ -38,8 +38,8 @@ public class ConlluTokenSampleStreamTest {
 
       TokenSample expected1 = TokenSample.parse(
           "Fachlich kompetent" + TokenSample.DEFAULT_SEPARATOR_CHARS
-          + ", sehr gute Beratung und ein freundliches Team" + 
TokenSample.DEFAULT_SEPARATOR_CHARS
-          + ".", TokenSample.DEFAULT_SEPARATOR_CHARS);
+              + ", sehr gute Beratung und ein freundliches Team" + 
TokenSample.DEFAULT_SEPARATOR_CHARS
+              + ".", TokenSample.DEFAULT_SEPARATOR_CHARS);
       Assert.assertEquals(expected1, stream.read());
 
       TokenSample expected2 = TokenSample.parse("Beiden Zahnärzten verdanke 
ich einen " +
@@ -50,4 +50,51 @@ public class ConlluTokenSampleStreamTest {
       Assert.assertNull("Stream must be exhausted", stream.read());
     }
   }
+
+  /**
+   * Reads the first sentence of pt_br-ud-sample.conllu and compares it with the
+   * expected sample. The expected sample is the sentence text with the contractions
+   * kept as written (e.g. "Numa", "da") and a token separator placed in front of
+   * every comma and the final period.
+   */
+  @Test
+  public void testParseContraction() throws IOException {
+    InputStreamFactory streamFactory =
+        new ResourceAsStreamFactory(ConlluStreamTest.class, 
"pt_br-ud-sample.conllu");
+
+    try (ObjectStream<TokenSample> stream = new ConlluTokenSampleStream(new 
ConlluStream(streamFactory))) {
+
+      TokenSample expected1 = TokenSample.parse(
+          "Numa reunião entre representantes da Secretaria da Criança do DF 
" +
+              "ea juíza da Vara de Execuções de Medidas Socioeducativas" +
+              TokenSample.DEFAULT_SEPARATOR_CHARS + ", Lavínia Tupi Vieira 
Fonseca" +
+              TokenSample.DEFAULT_SEPARATOR_CHARS + ", ficou acordado que dos 
25 internos" +
+              TokenSample.DEFAULT_SEPARATOR_CHARS + ", 12 serão internados na 
Unidade de " +
+              "Planaltina e os outros 13 devem retornar para a Unidade do 
Recanto das Emas" +
+              TokenSample.DEFAULT_SEPARATOR_CHARS + ", antigo Ciago" +
+              TokenSample.DEFAULT_SEPARATOR_CHARS + "."
+          , TokenSample.DEFAULT_SEPARATOR_CHARS);
+      TokenSample predicted = stream.read();
+      Assert.assertEquals(expected1, predicted);
+    }
+  }
+
+  /**
+   * Reads the first sentence of es-ud-sample.conllu and compares it with the
+   * expected sample. The expected sample is the sentence text with a token
+   * separator at every place where a punctuation mark touches its neighbour
+   * without whitespace (commas, quotes and the final period).
+   */
+  @Test
+  public void testParseSpanishS300() throws IOException {
+    InputStreamFactory streamFactory =
+        new ResourceAsStreamFactory(ConlluStreamTest.class, 
"es-ud-sample.conllu");
+
+    try (ObjectStream<TokenSample> stream = new ConlluTokenSampleStream(new 
ConlluStream(streamFactory))) {
+
+      TokenSample expected1 = TokenSample.parse(
+          "Digámoslo claramente" + TokenSample.DEFAULT_SEPARATOR_CHARS +
+              ", la insurgencia se ha pronunciado mucho más claramente 
respecto al " +
+              "tema de la paz que el Estado" + 
TokenSample.DEFAULT_SEPARATOR_CHARS +
+              ", como lo demuestra el fragmento que Bermúdez cita de la 
respuesta de \"" +
+              TokenSample.DEFAULT_SEPARATOR_CHARS + "Gabino" +
+              TokenSample.DEFAULT_SEPARATOR_CHARS + "\" a Piedad Córdoba" +
+              TokenSample.DEFAULT_SEPARATOR_CHARS + ", en la cual no se 
plantea ni siquiera \"" +
+              TokenSample.DEFAULT_SEPARATOR_CHARS + "esperar un mejor 
gobierno" +
+              TokenSample.DEFAULT_SEPARATOR_CHARS + "\"" +
+              TokenSample.DEFAULT_SEPARATOR_CHARS + "."
+
+          , TokenSample.DEFAULT_SEPARATOR_CHARS);
+      TokenSample predicted = stream.read();
+      Assert.assertEquals(expected1, predicted);
+    }
+  }
 }

http://git-wip-us.apache.org/repos/asf/opennlp/blob/cc173c2e/opennlp-tools/src/test/resources/opennlp/tools/formats/conllu/es-ud-sample.conllu
----------------------------------------------------------------------
diff --git 
a/opennlp-tools/src/test/resources/opennlp/tools/formats/conllu/es-ud-sample.conllu
 
b/opennlp-tools/src/test/resources/opennlp/tools/formats/conllu/es-ud-sample.conllu
new file mode 100644
index 0000000..e30c52b
--- /dev/null
+++ 
b/opennlp-tools/src/test/resources/opennlp/tools/formats/conllu/es-ud-sample.conllu
@@ -0,0 +1,62 @@
+# sent_id = es-train-001-s300
+# text = Digámoslo claramente, la insurgencia se ha pronunciado mucho más 
claramente respecto al tema de la paz que el Estado, como lo demuestra el 
fragmento que Bermúdez cita de la respuesta de "Gabino" a Piedad Córdoba, en 
la cual no se plantea ni siquiera "esperar un mejor gobierno".
+1-3    Digámoslo      _       _       _       _       _       _       _       
_
+1      Digám  digám  VERB    _       VerbForm=Fin    0       root    _       
_
+2      os      tú     PRON    _       
Case=Acc,Dat|Number=Plur|Person=2|PrepCase=Npr|PronType=Prs     1       iobj    
_       _
+3      lo      él     PRON    _       
Case=Acc|Gender=Masc|Number=Sing|Person=3|PrepCase=Npr|PronType=Prs     1       
obj     _       _
+4      claramente      claramente      ADV     _       _       1       advmod  
_       SpaceAfter=No
+5      ,       ,       PUNCT   _       _       1       punct   _       _
+6      la      el      DET     _       
Definite=Def|Gender=Fem|Number=Sing|PronType=Art        7       det     _       
_
+7      insurgencia     insurgencia     NOUN    _       Gender=Fem|Number=Sing  
10      nsubj   _       _
+8      se      él     PRON    _       
Case=Acc,Dat|Person=3|PrepCase=Npr|PronType=Prs|Reflex=Yes      10      iobj    
_       _
+9      ha      haber   AUX     _       
Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin   10      aux     _       
_
+10     pronunciado     pronunciar      VERB    _       
Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part        1       parataxis       
_       _
+11     mucho   mucho   PRON    _       NumType=Card|PronType=Ind       12      
nmod    _       _
+12     más    más    ADV     _       Degree=Cmp      13      advmod  _       
_
+13     claramente      claramente      ADV     _       _       10      advmod  
_       _
+14     respecto        respecto        NOUN    _       Gender=Masc|Number=Sing 
17      nmod    _       _
+15-16  al      _       _       _       _       _       _       _       _
+15     a       a       ADP     _       _       14      fixed   _       _
+16     el      el      DET     _       
Definite=Def|Gender=Masc|Number=Sing|PronType=Art       14      det     _       
_
+17     tema    tema    NOUN    _       Gender=Masc|Number=Sing 10      obl     
_       _
+18     de      de      ADP     _       _       20      case    _       _
+19     la      el      DET     _       
Definite=Def|Gender=Fem|Number=Sing|PronType=Art        20      det     _       
_
+20     paz     paz     NOUN    _       Gender=Fem|Number=Sing  17      nmod    
_       _
+21     que     que     CCONJ   _       _       23      case    _       _
+22     el      el      DET     _       
Definite=Def|Gender=Masc|Number=Sing|PronType=Art       23      det     _       
_
+23     Estado  estado  NOUN    _       _       12      nmod    _       
SpaceAfter=No
+24     ,       ,       PUNCT   _       _       27      punct   _       _
+25     como    como    SCONJ   _       _       27      mark    _       _
+26     lo      él     PRON    _       
Case=Acc|Gender=Masc|Number=Sing|Person=3|PrepCase=Npr|PronType=Prs     27      
obj     _       _
+27     demuestra       demostrar       VERB    _       
Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin   10      advcl   _       
_
+28     el      el      DET     _       
Definite=Def|Gender=Masc|Number=Sing|PronType=Art       29      det     _       
_
+29     fragmento       fragmento       NOUN    _       Gender=Masc|Number=Sing 
27      nsubj   _       _
+30     que     que     SCONJ   _       _       32      mark    _       _
+31     Bermúdez       bermúdez       PROPN   _       _       32      nsubj   
_       _
+32     cita    cita    VERB    _       
Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin   29      acl:relcl       
_       _
+33     de      de      ADP     _       _       35      case    _       _
+34     la      el      DET     _       
Definite=Def|Gender=Fem|Number=Sing|PronType=Art        35      det     _       
_
+35     respuesta       respuesta       NOUN    _       Gender=Fem|Number=Sing  
29      nmod    _       _
+36     de      de      ADP     _       _       38      case    _       _
+37     "       "       PUNCT   _       _       38      punct   _       
SpaceAfter=No
+38     Gabino  gabino  PROPN   _       _       35      nmod    _       
SpaceAfter=No
+39     "       "       PUNCT   _       _       38      punct   _       _
+40     a       a       ADP     _       _       41      case    _       _
+41     Piedad  piedad  PROPN   _       _       35      nmod    _       _
+42     Córdoba        córdoba        PROPN   _       _       41      flat    
_       SpaceAfter=No
+43     ,       ,       PUNCT   _       _       49      punct   _       _
+44     en      en      ADP     _       _       46      case    _       _
+45     la      el      DET     _       
Definite=Def|Gender=Fem|Number=Sing|PronType=Art        46      det     _       
_
+46     cual    cual    PRON    _       Number=Sing|PronType=Int,Rel    49      
mark    _       _
+47     no      no      ADV     _       Polarity=Neg    49      advmod  _       
_
+48     se      él     PRON    _       
Case=Acc,Dat|Person=3|PrepCase=Npr|PronType=Prs|Reflex=Yes      49      iobj    
_       _
+49     plantea plantear        VERB    _       
Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin   35      acl:relcl       
_       _
+50     ni      ni      CCONJ   _       Polarity=Neg    53      advmod  _       
_
+51     siquiera        siquiera        ADV     _       _       50      fixed   
_       _
+52     "       "       PUNCT   _       _       53      punct   _       
SpaceAfter=No
+53     esperar esperar VERB    _       VerbForm=Inf    49      csubj   _       
_
+54     un      uno     DET     _       
Definite=Ind|Gender=Masc|Number=Sing|PronType=Art       56      det     _       
_
+55     mejor   mejor   ADJ     _       Degree=Cmp|Number=Sing  56      amod    
_       _
+56     gobierno        gobierno        NOUN    _       Gender=Masc|Number=Sing 
53      obj     _       SpaceAfter=No
+57     "       "       PUNCT   _       _       53      punct   _       
SpaceAfter=No
+58     .       .       PUNCT   _       _       1       punct   _       _

http://git-wip-us.apache.org/repos/asf/opennlp/blob/cc173c2e/opennlp-tools/src/test/resources/opennlp/tools/formats/conllu/pt_br-ud-sample.conllu
----------------------------------------------------------------------
diff --git 
a/opennlp-tools/src/test/resources/opennlp/tools/formats/conllu/pt_br-ud-sample.conllu
 
b/opennlp-tools/src/test/resources/opennlp/tools/formats/conllu/pt_br-ud-sample.conllu
new file mode 100644
index 0000000..f616044
--- /dev/null
+++ 
b/opennlp-tools/src/test/resources/opennlp/tools/formats/conllu/pt_br-ud-sample.conllu
@@ -0,0 +1,76 @@
+# sent_id = train-s2
+# text = Numa reunião entre representantes da Secretaria da Criança do DF ea 
juíza da Vara de Execuções de Medidas Socioeducativas, Lavínia Tupi Vieira 
Fonseca, ficou acordado que dos 25 internos, 12 serão internados na Unidade de 
Planaltina e os outros 13 devem retornar para a Unidade do Recanto das Emas, 
antigo Ciago.
+1-2    Numa    _       _       _       _       _       _       _       _
+1      Em      _       ADP     ADP     _       3       case    _       _
+2      uma     _       DET     DET     _       3       det     _       _
+3      reunião        _       NOUN    NOUN    _       31      nmod    _       
_
+4      entre   _       ADP     ADP     _       5       case    _       _
+5      representantes  _       NOUN    NOUN    _       3       nmod    _       
_
+6-7    da      _       _       _       _       _       _       _       _
+6      de      de      ADP     ADP     _       8       case    _       _
+7      a       o       DET     DET     
Definite=Def|Gender=Fem|Number=Sing|PronType=Art        8       det     _       
_
+8      Secretaria      _       PROPN   PNOUN   _       5       nmod    _       
_
+9-10   da      _       _       _       _       _       _       _       _
+9      de      de      ADP     ADP     _       11      case    _       _
+10     a       o       DET     DET     
Definite=Def|Gender=Fem|Number=Sing|PronType=Art        11      det     _       
_
+11     Criança        _       PROPN   PNOUN   _       8       nmod    _       
_
+12-13  do      _       _       _       _       _       _       _       _
+12     de      de      ADP     ADP     _       14      case    _       _
+13     o       o       DET     DET     
Definite=Def|Gender=Masc|Number=Sing|PronType=Art       14      det     _       
_
+14     DF      _       PROPN   PNOUN   _       8       nmod    _       _
+15     ea      _       CCONJ   CONJ    _       16      cc      _       _
+16     juíza  _       NOUN    NOUN    _       5       conj    _       _
+17-18  da      _       _       _       _       _       _       _       _
+17     de      de      ADP     ADP     _       19      case    _       _
+18     a       o       DET     DET     
Definite=Def|Gender=Fem|Number=Sing|PronType=Art        19      det     _       
_
+19     Vara    _       PROPN   PNOUN   _       16      nmod    _       _
+20     de      _       ADP     ADP     _       21      case    _       _
+21     Execuções     _       PROPN   PNOUN   _       19      nmod    _       
_
+22     de      _       ADP     ADP     _       23      case    _       _
+23     Medidas _       PROPN   PNOUN   _       21      nmod    _       _
+24     Socioeducativas _       PROPN   PNOUN   _       23      amod    _       
SpaceAfter=No
+25     ,       _       PUNCT   .       _       26      punct   _       _
+26     Lavínia        _       PROPN   PNOUN   _       16      appos   _       
_
+27     Tupi    _       PROPN   PNOUN   _       26      flat    _       _
+28     Vieira  _       PROPN   PNOUN   _       26      flat    _       _
+29     Fonseca _       PROPN   PNOUN   _       26      flat    _       
SpaceAfter=No
+30     ,       _       PUNCT   .       _       3       punct   _       _
+31     ficou   _       VERB    VERB    _       0       root    _       _
+32     acordado        _       ADJ     ADJ     _       31      xcomp:adj       
_       _
+33     que     _       CCONJ   CONJ    _       41      mark    _       _
+34-35  dos     _       _       _       _       _       _       _       _
+34     de      de      ADP     ADP     _       37      case    _       _
+35     os      o       DET     DET     
Definite=Def|Gender=Masc|Number=Plur|PronType=Art       37      det     _       
_
+36     25      _       NUM     NUM     NumType=Card    37      nummod  _       
_
+37     internos        _       NOUN    NOUN    _       41      nmod    _       
SpaceAfter=No
+38     ,       _       PUNCT   .       _       37      punct   _       _
+39     12      _       NUM     NUM     NumType=Card    41      nsubj:pass      
_       _
+40     serão  _       AUX     AUX     _       41      aux:pass        _       
_
+41     internados      _       VERB    VERB    _       31      csubj   _       
_
+42-43  na      _       _       _       _       _       _       _       _
+42     en      en      ADP     ADP     _       44      case    _       _
+43     a       o       DET     DET     
Definite=Def|Gender=Fem|Number=Sing|PronType=Art        44      det     _       
_
+44     Unidade _       PROPN   PNOUN   _       41      nmod    _       _
+45     de      _       ADP     ADP     _       46      case    _       _
+46     Planaltina      _       PROPN   PNOUN   _       44      nmod    _       
_
+47     e       _       CCONJ   CONJ    _       52      cc      _       _
+48     os      _       DET     DET     _       50      det     _       _
+49     outros  _       DET     DET     _       50      det     _       _
+50     13      _       NUM     NUM     NumType=Card    52      nsubj   _       
_
+51     devem   _       AUX     AUX     _       52      aux     _       _
+52     retornar        _       VERB    VERB    _       41      conj    _       
_
+53     para    _       ADP     ADP     _       55      case    _       _
+54     a       _       DET     DET     _       55      det     _       _
+55     Unidade _       PROPN   PNOUN   _       52      nmod    _       _
+56-57  do      _       _       _       _       _       _       _       _
+56     de      de      ADP     ADP     _       58      case    _       _
+57     o       o       DET     DET     
Definite=Def|Gender=Masc|Number=Sing|PronType=Art       58      det     _       
_
+58     Recanto _       NOUN    NOUN    _       55      nmod    _       _
+59-60  das     _       _       _       _       _       _       _       _
+59     de      de      ADP     ADP     _       61      case    _       _
+60     as      o       DET     DET     
Definite=Def|Gender=Fem|Number=Plur|PronType=Art        61      det     _       
_
+61     Emas    _       PROPN   PNOUN   _       58      nmod    _       
SpaceAfter=No
+62     ,       _       PUNCT   .       _       64      punct   _       _
+63     antigo  _       ADJ     ADJ     _       64      amod    _       _
+64     Ciago   _       PROPN   PNOUN   _       55      appos   _       
SpaceAfter=No
+65     .       _       PUNCT   .       _       31      punct   _       _

Reply via email to