Repository: opennlp Updated Branches: refs/heads/master f418eed30 -> cc173c2e4
OPENNLP-1083: Conll-U Sample contraction handling closes apache/opennlp#222 Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/cc173c2e Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/cc173c2e Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/cc173c2e Branch: refs/heads/master Commit: cc173c2e4d47d6ee49b4b6050a0fea779d691429 Parents: f418eed Author: William D C M SILVA <[email protected]> Authored: Tue May 30 12:56:20 2017 -0300 Committer: William D C M SILVA <[email protected]> Committed: Tue May 30 12:56:20 2017 -0300 ---------------------------------------------------------------------- .../tools/formats/conllu/ConlluStream.java | 86 ++++++++++++++++++++ .../formats/conllu/ConlluTokenSampleStream.java | 11 +-- .../tools/formats/conllu/ConlluWordLine.java | 14 ++++ .../conllu/ConlluLemmaSampleStreamTest.java | 49 +++++++++++ .../conllu/ConlluPOSSampleStreamTest.java | 77 ++++++++++++++++++ .../conllu/ConlluTokenSampleStreamTest.java | 51 +++++++++++- .../tools/formats/conllu/es-ud-sample.conllu | 62 ++++++++++++++ .../tools/formats/conllu/pt_br-ud-sample.conllu | 76 +++++++++++++++++ 8 files changed, 417 insertions(+), 9 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/opennlp/blob/cc173c2e/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluStream.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluStream.java index cbac450..4dd204f 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluStream.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluStream.java @@ -22,7 +22,10 @@ import java.io.IOException; import java.io.StringReader; import java.nio.charset.StandardCharsets; 
import java.util.ArrayList; +import java.util.HashMap; import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; import opennlp.tools.util.InputStreamFactory; import opennlp.tools.util.ObjectStream; @@ -81,12 +84,95 @@ public class ConlluStream implements ObjectStream<ConlluSentence> { } } + wordLines = postProcessContractions(wordLines); + return new ConlluSentence(wordLines, sentenceId, text); } return null; } + private List<ConlluWordLine> postProcessContractions(List<ConlluWordLine> lines) { + + + // 1. Find contractions + Map<String, Integer> index = new HashMap(); + Map<String, List<String>> contractions = new HashMap(); + List<String> linesToDelete = new ArrayList(); + + for (int i = 0; i < lines.size(); i++) { + ConlluWordLine line = lines.get(i); + index.put(line.getId(), i); + if (line.getId().contains("-")) { + List<String> expandedContractions = new ArrayList(); + String[] ids = line.getId().split("-"); + int start = Integer.parseInt(ids[0]); + int end = Integer.parseInt(ids[1]); + for (int j = start; j <= end; j++) { + String js = Integer.toString(j); + expandedContractions.add(js); + linesToDelete.add(js); + } + contractions.put(line.getId(), expandedContractions); + } + } + + // 2. Merge annotation + for (String contractionId: contractions.keySet()) { + ConlluWordLine contraction = lines.get(index.get(contractionId)); + List<ConlluWordLine> expandedParts = new ArrayList(); + for (String id : contractions.get(contractionId)) { + expandedParts.add(lines.get(index.get(id))); + } + ConlluWordLine merged = mergeAnnotation(contraction, expandedParts); + lines.set(index.get(contractionId), merged); + } + + // 3. 
Delete the expanded parts + for (int i = linesToDelete.size() - 1; i >= 0; i--) { + lines.remove(index.get(linesToDelete.get(i)).intValue()); + } + return lines; + } + + /** + * Merges token level annotations + * @param contraction the line that receives the annotation + * @param expandedParts the lines to get annotation + * @return the merged line + */ + private ConlluWordLine mergeAnnotation(ConlluWordLine contraction, + List<ConlluWordLine> expandedParts) { + String id = contraction.getId(); + String form = contraction.getForm(); + String lemma = expandedParts.stream() + .filter(p -> !"_".equals(p.getLemma())) + .map(p -> p.getLemma()) + .collect(Collectors.joining("+")); + + String uPosTag = expandedParts.stream() + .filter(p -> !"_".equals(p.getPosTag(ConlluTagset.U))) + .map(p -> p.getPosTag(ConlluTagset.U)) + .collect(Collectors.joining("+")); + + String xPosTag = expandedParts.stream() + .filter(p -> !"_".equals(p.getPosTag(ConlluTagset.X))) + .map(p -> p.getPosTag(ConlluTagset.X)) + .collect(Collectors.joining("+")); + + String feats = expandedParts.stream() + .filter(p -> !"_".equals(p.getFeats())) + .map(p -> p.getFeats()) + .collect(Collectors.joining("+")); + + String head = contraction.getHead(); + String deprel = contraction.getDeprel(); + String deps = contraction.getDeps(); + String misc = contraction.getMisc(); + + return new ConlluWordLine(id, form, lemma, uPosTag, xPosTag, feats,head, deprel, deps, misc); + } + @Override public void close() throws IOException { sentenceStream.close(); http://git-wip-us.apache.org/repos/asf/opennlp/blob/cc173c2e/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluTokenSampleStream.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluTokenSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluTokenSampleStream.java index a9ad937..bc6907b 100644 --- 
a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluTokenSampleStream.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluTokenSampleStream.java @@ -53,15 +53,12 @@ public class ConlluTokenSampleStream extends FilterObjectStream<ConlluSentence, token, sentence.getSentenceIdComment(), text)); } - int charAfterTokenIndex = tokenIndex + token.length(); - if (charAfterTokenIndex < text.length()) { - if (!StringUtil.isWhitespace(text.charAt(charAfterTokenIndex))) { - text.insert(charAfterTokenIndex, + searchIndex = tokenIndex + token.length(); + if (searchIndex < text.length()) { + if (!StringUtil.isWhitespace(text.charAt(searchIndex))) { + text.insert(searchIndex, TokenSample.DEFAULT_SEPARATOR_CHARS); - searchIndex += TokenSample.DEFAULT_SEPARATOR_CHARS.length(); } - - searchIndex += token.length(); } } return TokenSample.parse(text.toString(), TokenSample.DEFAULT_SEPARATOR_CHARS); http://git-wip-us.apache.org/repos/asf/opennlp/blob/cc173c2e/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluWordLine.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluWordLine.java b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluWordLine.java index 9881bf1..4e626be 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluWordLine.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluWordLine.java @@ -32,6 +32,20 @@ public class ConlluWordLine { private final String deps; private final String misc; + ConlluWordLine(String id, String form, String lemma, String uPosTag, String xPosTag, + String feats, String head, String deprel, String deps, String misc) { + this.id = id; + this.form = form; + this.lemma = lemma; + this.uPosTag = uPosTag; + this.xPosTag = xPosTag; + this.feats = feats; + this.head = head; + this.deprel = deprel; + this.deps = deps; + this.misc = misc; + } + 
ConlluWordLine(String line) throws InvalidFormatException { String[] fields = line.split("\t"); http://git-wip-us.apache.org/repos/asf/opennlp/blob/cc173c2e/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluLemmaSampleStreamTest.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluLemmaSampleStreamTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluLemmaSampleStreamTest.java new file mode 100644 index 0000000..5d58cf1 --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluLemmaSampleStreamTest.java @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package opennlp.tools.formats.conllu; + + +import java.io.IOException; + +import org.junit.Assert; +import org.junit.Test; + +import opennlp.tools.formats.ResourceAsStreamFactory; +import opennlp.tools.lemmatizer.LemmaSample; +import opennlp.tools.util.InputStreamFactory; +import opennlp.tools.util.ObjectStream; + +public class ConlluLemmaSampleStreamTest { + + + @Test + public void testParseSpanishS300() throws IOException { + InputStreamFactory streamFactory = + new ResourceAsStreamFactory(ConlluStreamTest.class, "es-ud-sample.conllu"); + + try (ObjectStream<LemmaSample> stream = new ConlluLemmaSampleStream( + new ConlluStream(streamFactory), ConlluTagset.U)) { + + LemmaSample predicted = stream.read(); + System.out.println(predicted); + Assert.assertEquals("digám+tú+él", predicted.getLemmas()[0]); + Assert.assertEquals("la", predicted.getTokens()[3]); + Assert.assertEquals("el", predicted.getLemmas()[3]); + } + } +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/cc173c2e/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluPOSSampleStreamTest.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluPOSSampleStreamTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluPOSSampleStreamTest.java new file mode 100644 index 0000000..f6bef72 --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluPOSSampleStreamTest.java @@ -0,0 +1,77 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.formats.conllu; + + +import java.io.IOException; + +import org.junit.Assert; +import org.junit.Test; + +import opennlp.tools.formats.ResourceAsStreamFactory; +import opennlp.tools.postag.POSSample; +import opennlp.tools.util.InputStreamFactory; +import opennlp.tools.util.ObjectStream; + +public class ConlluPOSSampleStreamTest { + @Test + public void testParseContraction() throws IOException { + InputStreamFactory streamFactory = + new ResourceAsStreamFactory(ConlluStreamTest.class, "pt_br-ud-sample.conllu"); + + try (ObjectStream<POSSample> stream = new ConlluPOSSampleStream( + new ConlluStream(streamFactory), ConlluTagset.U)) { + + POSSample expected = POSSample.parse("Numa_ADP+DET reunião_NOUN entre_ADP " + + "representantes_NOUN da_ADP+DET Secretaria_PROPN da_ADP+DET Criança_PROPN do_ADP+DET " + + "DF_PROPN ea_CCONJ juíza_NOUN da_ADP+DET Vara_PROPN de_ADP Execuções_PROPN de_ADP " + + "Medidas_PROPN Socioeducativas_PROPN ,_PUNCT Lavínia_PROPN Tupi_PROPN Vieira_PROPN " + + "Fonseca_PROPN ,_PUNCT ficou_VERB acordado_ADJ que_CCONJ dos_ADP+DET 25_NUM " + + "internos_NOUN ,_PUNCT 12_NUM serão_AUX internados_VERB na_ADP+DET Unidade_PROPN " + + "de_ADP Planaltina_PROPN e_CCONJ os_DET outros_DET 13_NUM devem_AUX retornar_VERB " + + "para_ADP a_DET Unidade_PROPN do_ADP+DET Recanto_NOUN das_ADP+DET Emas_PROPN ,_PUNCT " + + "antigo_ADJ Ciago_PROPN ._PUNCT"); + + POSSample predicted = stream.read(); + Assert.assertEquals(expected, predicted); + } + } + + + @Test + public void testParseSpanishS300() throws IOException { + 
InputStreamFactory streamFactory = + new ResourceAsStreamFactory(ConlluStreamTest.class, "es-ud-sample.conllu"); + + try (ObjectStream<POSSample> stream = new ConlluPOSSampleStream(new ConlluStream(streamFactory), + ConlluTagset.U)) { + + POSSample expected1 = POSSample.parse( + "Digámoslo_VERB+PRON+PRON claramente_ADV ,_PUNCT la_DET insurgencia_NOUN se_PRON " + + "ha_AUX pronunciado_VERB mucho_PRON más_ADV claramente_ADV respecto_NOUN " + + "al_ADP+DET tema_NOUN de_ADP la_DET paz_NOUN que_CCONJ el_DET Estado_NOUN ,_PUNCT " + + "como_SCONJ lo_PRON demuestra_VERB el_DET fragmento_NOUN que_SCONJ Bermúdez_PROPN " + + "cita_VERB de_ADP la_DET respuesta_NOUN de_ADP \"_PUNCT Gabino_PROPN \"_PUNCT " + + "a_ADP Piedad_PROPN Córdoba_PROPN ,_PUNCT en_ADP la_DET cual_PRON no_ADV se_PRON " + + "plantea_VERB ni_CCONJ siquiera_ADV \"_PUNCT esperar_VERB un_DET mejor_ADJ " + + "gobierno_NOUN \"_PUNCT ._PUNCT"); + POSSample predicted = stream.read(); + Assert.assertEquals(expected1, predicted); + } + } +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/cc173c2e/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluTokenSampleStreamTest.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluTokenSampleStreamTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluTokenSampleStreamTest.java index 62cb9a6..be32a3b 100644 --- a/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluTokenSampleStreamTest.java +++ b/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluTokenSampleStreamTest.java @@ -38,8 +38,8 @@ public class ConlluTokenSampleStreamTest { TokenSample expected1 = TokenSample.parse( "Fachlich kompetent" + TokenSample.DEFAULT_SEPARATOR_CHARS - + ", sehr gute Beratung und ein freundliches Team" + TokenSample.DEFAULT_SEPARATOR_CHARS - + ".", TokenSample.DEFAULT_SEPARATOR_CHARS); + + ", sehr gute Beratung und ein freundliches Team" + 
TokenSample.DEFAULT_SEPARATOR_CHARS + + ".", TokenSample.DEFAULT_SEPARATOR_CHARS); Assert.assertEquals(expected1, stream.read()); TokenSample expected2 = TokenSample.parse("Beiden Zahnärzten verdanke ich einen " + @@ -50,4 +50,51 @@ public class ConlluTokenSampleStreamTest { Assert.assertNull("Stream must be exhausted", stream.read()); } } + + @Test + public void testParseContraction() throws IOException { + InputStreamFactory streamFactory = + new ResourceAsStreamFactory(ConlluStreamTest.class, "pt_br-ud-sample.conllu"); + + try (ObjectStream<TokenSample> stream = new ConlluTokenSampleStream(new ConlluStream(streamFactory))) { + + TokenSample expected1 = TokenSample.parse( + "Numa reunião entre representantes da Secretaria da Criança do DF " + + "ea juíza da Vara de Execuções de Medidas Socioeducativas" + + TokenSample.DEFAULT_SEPARATOR_CHARS + ", Lavínia Tupi Vieira Fonseca" + + TokenSample.DEFAULT_SEPARATOR_CHARS + ", ficou acordado que dos 25 internos" + + TokenSample.DEFAULT_SEPARATOR_CHARS + ", 12 serão internados na Unidade de " + + "Planaltina e os outros 13 devem retornar para a Unidade do Recanto das Emas" + + TokenSample.DEFAULT_SEPARATOR_CHARS + ", antigo Ciago" + + TokenSample.DEFAULT_SEPARATOR_CHARS + "." 
+ , TokenSample.DEFAULT_SEPARATOR_CHARS); + TokenSample predicted = stream.read(); + Assert.assertEquals(expected1, predicted); + } + } + + @Test + public void testParseSpanishS300() throws IOException { + InputStreamFactory streamFactory = + new ResourceAsStreamFactory(ConlluStreamTest.class, "es-ud-sample.conllu"); + + try (ObjectStream<TokenSample> stream = new ConlluTokenSampleStream(new ConlluStream(streamFactory))) { + + TokenSample expected1 = TokenSample.parse( + "Digámoslo claramente" + TokenSample.DEFAULT_SEPARATOR_CHARS + + ", la insurgencia se ha pronunciado mucho más claramente respecto al " + + "tema de la paz que el Estado" + TokenSample.DEFAULT_SEPARATOR_CHARS + + ", como lo demuestra el fragmento que Bermúdez cita de la respuesta de \"" + + TokenSample.DEFAULT_SEPARATOR_CHARS + "Gabino" + + TokenSample.DEFAULT_SEPARATOR_CHARS + "\" a Piedad Córdoba" + + TokenSample.DEFAULT_SEPARATOR_CHARS + ", en la cual no se plantea ni siquiera \"" + + TokenSample.DEFAULT_SEPARATOR_CHARS + "esperar un mejor gobierno" + + TokenSample.DEFAULT_SEPARATOR_CHARS + "\"" + + TokenSample.DEFAULT_SEPARATOR_CHARS + "." 
+ + , TokenSample.DEFAULT_SEPARATOR_CHARS); + TokenSample predicted = stream.read(); + Assert.assertEquals(expected1, predicted); + } + } } http://git-wip-us.apache.org/repos/asf/opennlp/blob/cc173c2e/opennlp-tools/src/test/resources/opennlp/tools/formats/conllu/es-ud-sample.conllu ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/resources/opennlp/tools/formats/conllu/es-ud-sample.conllu b/opennlp-tools/src/test/resources/opennlp/tools/formats/conllu/es-ud-sample.conllu new file mode 100644 index 0000000..e30c52b --- /dev/null +++ b/opennlp-tools/src/test/resources/opennlp/tools/formats/conllu/es-ud-sample.conllu @@ -0,0 +1,62 @@ +# sent_id = es-train-001-s300 +# text = Digámoslo claramente, la insurgencia se ha pronunciado mucho más claramente respecto al tema de la paz que el Estado, como lo demuestra el fragmento que Bermúdez cita de la respuesta de "Gabino" a Piedad Córdoba, en la cual no se plantea ni siquiera "esperar un mejor gobierno". 
+1-3 Digámoslo _ _ _ _ _ _ _ _ +1 Digám digám VERB _ VerbForm=Fin 0 root _ _ +2 os tú PRON _ Case=Acc,Dat|Number=Plur|Person=2|PrepCase=Npr|PronType=Prs 1 iobj _ _ +3 lo él PRON _ Case=Acc|Gender=Masc|Number=Sing|Person=3|PrepCase=Npr|PronType=Prs 1 obj _ _ +4 claramente claramente ADV _ _ 1 advmod _ SpaceAfter=No +5 , , PUNCT _ _ 1 punct _ _ +6 la el DET _ Definite=Def|Gender=Fem|Number=Sing|PronType=Art 7 det _ _ +7 insurgencia insurgencia NOUN _ Gender=Fem|Number=Sing 10 nsubj _ _ +8 se él PRON _ Case=Acc,Dat|Person=3|PrepCase=Npr|PronType=Prs|Reflex=Yes 10 iobj _ _ +9 ha haber AUX _ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 10 aux _ _ +10 pronunciado pronunciar VERB _ Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part 1 parataxis _ _ +11 mucho mucho PRON _ NumType=Card|PronType=Ind 12 nmod _ _ +12 más más ADV _ Degree=Cmp 13 advmod _ _ +13 claramente claramente ADV _ _ 10 advmod _ _ +14 respecto respecto NOUN _ Gender=Masc|Number=Sing 17 nmod _ _ +15-16 al _ _ _ _ _ _ _ _ +15 a a ADP _ _ 14 fixed _ _ +16 el el DET _ Definite=Def|Gender=Masc|Number=Sing|PronType=Art 14 det _ _ +17 tema tema NOUN _ Gender=Masc|Number=Sing 10 obl _ _ +18 de de ADP _ _ 20 case _ _ +19 la el DET _ Definite=Def|Gender=Fem|Number=Sing|PronType=Art 20 det _ _ +20 paz paz NOUN _ Gender=Fem|Number=Sing 17 nmod _ _ +21 que que CCONJ _ _ 23 case _ _ +22 el el DET _ Definite=Def|Gender=Masc|Number=Sing|PronType=Art 23 det _ _ +23 Estado estado NOUN _ _ 12 nmod _ SpaceAfter=No +24 , , PUNCT _ _ 27 punct _ _ +25 como como SCONJ _ _ 27 mark _ _ +26 lo él PRON _ Case=Acc|Gender=Masc|Number=Sing|Person=3|PrepCase=Npr|PronType=Prs 27 obj _ _ +27 demuestra demostrar VERB _ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 10 advcl _ _ +28 el el DET _ Definite=Def|Gender=Masc|Number=Sing|PronType=Art 29 det _ _ +29 fragmento fragmento NOUN _ Gender=Masc|Number=Sing 27 nsubj _ _ +30 que que SCONJ _ _ 32 mark _ _ +31 Bermúdez bermúdez PROPN _ _ 32 nsubj _ _ +32 cita cita VERB _ 
Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 29 acl:relcl _ _ +33 de de ADP _ _ 35 case _ _ +34 la el DET _ Definite=Def|Gender=Fem|Number=Sing|PronType=Art 35 det _ _ +35 respuesta respuesta NOUN _ Gender=Fem|Number=Sing 29 nmod _ _ +36 de de ADP _ _ 38 case _ _ +37 " " PUNCT _ _ 38 punct _ SpaceAfter=No +38 Gabino gabino PROPN _ _ 35 nmod _ SpaceAfter=No +39 " " PUNCT _ _ 38 punct _ _ +40 a a ADP _ _ 41 case _ _ +41 Piedad piedad PROPN _ _ 35 nmod _ _ +42 Córdoba córdoba PROPN _ _ 41 flat _ SpaceAfter=No +43 , , PUNCT _ _ 49 punct _ _ +44 en en ADP _ _ 46 case _ _ +45 la el DET _ Definite=Def|Gender=Fem|Number=Sing|PronType=Art 46 det _ _ +46 cual cual PRON _ Number=Sing|PronType=Int,Rel 49 mark _ _ +47 no no ADV _ Polarity=Neg 49 advmod _ _ +48 se él PRON _ Case=Acc,Dat|Person=3|PrepCase=Npr|PronType=Prs|Reflex=Yes 49 iobj _ _ +49 plantea plantear VERB _ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 35 acl:relcl _ _ +50 ni ni CCONJ _ Polarity=Neg 53 advmod _ _ +51 siquiera siquiera ADV _ _ 50 fixed _ _ +52 " " PUNCT _ _ 53 punct _ SpaceAfter=No +53 esperar esperar VERB _ VerbForm=Inf 49 csubj _ _ +54 un uno DET _ Definite=Ind|Gender=Masc|Number=Sing|PronType=Art 56 det _ _ +55 mejor mejor ADJ _ Degree=Cmp|Number=Sing 56 amod _ _ +56 gobierno gobierno NOUN _ Gender=Masc|Number=Sing 53 obj _ SpaceAfter=No +57 " " PUNCT _ _ 53 punct _ SpaceAfter=No +58 . . 
PUNCT _ _ 1 punct _ _ http://git-wip-us.apache.org/repos/asf/opennlp/blob/cc173c2e/opennlp-tools/src/test/resources/opennlp/tools/formats/conllu/pt_br-ud-sample.conllu ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/resources/opennlp/tools/formats/conllu/pt_br-ud-sample.conllu b/opennlp-tools/src/test/resources/opennlp/tools/formats/conllu/pt_br-ud-sample.conllu new file mode 100644 index 0000000..f616044 --- /dev/null +++ b/opennlp-tools/src/test/resources/opennlp/tools/formats/conllu/pt_br-ud-sample.conllu @@ -0,0 +1,76 @@ +# sent_id = train-s2 +# text = Numa reunião entre representantes da Secretaria da Criança do DF ea juíza da Vara de Execuções de Medidas Socioeducativas, Lavínia Tupi Vieira Fonseca, ficou acordado que dos 25 internos, 12 serão internados na Unidade de Planaltina e os outros 13 devem retornar para a Unidade do Recanto das Emas, antigo Ciago. +1-2 Numa _ _ _ _ _ _ _ _ +1 Em _ ADP ADP _ 3 case _ _ +2 uma _ DET DET _ 3 det _ _ +3 reunião _ NOUN NOUN _ 31 nmod _ _ +4 entre _ ADP ADP _ 5 case _ _ +5 representantes _ NOUN NOUN _ 3 nmod _ _ +6-7 da _ _ _ _ _ _ _ _ +6 de de ADP ADP _ 8 case _ _ +7 a o DET DET Definite=Def|Gender=Fem|Number=Sing|PronType=Art 8 det _ _ +8 Secretaria _ PROPN PNOUN _ 5 nmod _ _ +9-10 da _ _ _ _ _ _ _ _ +9 de de ADP ADP _ 11 case _ _ +10 a o DET DET Definite=Def|Gender=Fem|Number=Sing|PronType=Art 11 det _ _ +11 Criança _ PROPN PNOUN _ 8 nmod _ _ +12-13 do _ _ _ _ _ _ _ _ +12 de de ADP ADP _ 14 case _ _ +13 o o DET DET Definite=Def|Gender=Masc|Number=Sing|PronType=Art 14 det _ _ +14 DF _ PROPN PNOUN _ 8 nmod _ _ +15 ea _ CCONJ CONJ _ 16 cc _ _ +16 juíza _ NOUN NOUN _ 5 conj _ _ +17-18 da _ _ _ _ _ _ _ _ +17 de de ADP ADP _ 19 case _ _ +18 a o DET DET Definite=Def|Gender=Fem|Number=Sing|PronType=Art 19 det _ _ +19 Vara _ PROPN PNOUN _ 16 nmod _ _ +20 de _ ADP ADP _ 21 case _ _ +21 Execuções _ PROPN PNOUN _ 19 nmod _ _ +22 de _ ADP ADP _ 23 case _ _ +23 Medidas _ 
PROPN PNOUN _ 21 nmod _ _ +24 Socioeducativas _ PROPN PNOUN _ 23 amod _ SpaceAfter=No +25 , _ PUNCT . _ 26 punct _ _ +26 Lavínia _ PROPN PNOUN _ 16 appos _ _ +27 Tupi _ PROPN PNOUN _ 26 flat _ _ +28 Vieira _ PROPN PNOUN _ 26 flat _ _ +29 Fonseca _ PROPN PNOUN _ 26 flat _ SpaceAfter=No +30 , _ PUNCT . _ 3 punct _ _ +31 ficou _ VERB VERB _ 0 root _ _ +32 acordado _ ADJ ADJ _ 31 xcomp:adj _ _ +33 que _ CCONJ CONJ _ 41 mark _ _ +34-35 dos _ _ _ _ _ _ _ _ +34 de de ADP ADP _ 37 case _ _ +35 os o DET DET Definite=Def|Gender=Masc|Number=Plur|PronType=Art 37 det _ _ +36 25 _ NUM NUM NumType=Card 37 nummod _ _ +37 internos _ NOUN NOUN _ 41 nmod _ SpaceAfter=No +38 , _ PUNCT . _ 37 punct _ _ +39 12 _ NUM NUM NumType=Card 41 nsubj:pass _ _ +40 serão _ AUX AUX _ 41 aux:pass _ _ +41 internados _ VERB VERB _ 31 csubj _ _ +42-43 na _ _ _ _ _ _ _ _ +42 en en ADP ADP _ 44 case _ _ +43 a o DET DET Definite=Def|Gender=Fem|Number=Sing|PronType=Art 44 det _ _ +44 Unidade _ PROPN PNOUN _ 41 nmod _ _ +45 de _ ADP ADP _ 46 case _ _ +46 Planaltina _ PROPN PNOUN _ 44 nmod _ _ +47 e _ CCONJ CONJ _ 52 cc _ _ +48 os _ DET DET _ 50 det _ _ +49 outros _ DET DET _ 50 det _ _ +50 13 _ NUM NUM NumType=Card 52 nsubj _ _ +51 devem _ AUX AUX _ 52 aux _ _ +52 retornar _ VERB VERB _ 41 conj _ _ +53 para _ ADP ADP _ 55 case _ _ +54 a _ DET DET _ 55 det _ _ +55 Unidade _ PROPN PNOUN _ 52 nmod _ _ +56-57 do _ _ _ _ _ _ _ _ +56 de de ADP ADP _ 58 case _ _ +57 o o DET DET Definite=Def|Gender=Masc|Number=Sing|PronType=Art 58 det _ _ +58 Recanto _ NOUN NOUN _ 55 nmod _ _ +59-60 das _ _ _ _ _ _ _ _ +59 de de ADP ADP _ 61 case _ _ +60 as o DET DET Definite=Def|Gender=Fem|Number=Plur|PronType=Art 61 det _ _ +61 Emas _ PROPN PNOUN _ 58 nmod _ SpaceAfter=No +62 , _ PUNCT . _ 64 punct _ _ +63 antigo _ ADJ ADJ _ 64 amod _ _ +64 Ciago _ PROPN PNOUN _ 55 appos _ SpaceAfter=No +65 . _ PUNCT . _ 31 punct _ _
