OPENNLP-1050: Add formats support for Irish Sentence Bank closes #191
Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/6f80a897 Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/6f80a897 Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/6f80a897 Branch: refs/heads/LangDetect Commit: 6f80a89705d84dd74da902d512ca4682aed07a57 Parents: 5bf5366 Author: Jim O'Regan <[email protected]> Authored: Sun Apr 30 21:25:03 2017 +0100 Committer: Jörn Kottmann <[email protected]> Committed: Wed May 24 16:52:42 2017 +0200 ---------------------------------------------------------------------- .../tools/cmdline/StreamFactoryRegistry.java | 5 + .../IrishSentenceBankDocument.java | 271 +++++++++++++++++++ .../IrishSentenceBankSentenceStream.java | 72 +++++ .../IrishSentenceBankSentenceStreamFactory.java | 61 +++++ .../IrishSentenceBankTokenSampleStream.java | 52 ++++ ...ishSentenceBankTokenSampleStreamFactory.java | 60 ++++ .../IrishSentenceBankDocumentTest.java | 67 +++++ .../irishsentencebank-sample.xml | 25 ++ 8 files changed, 613 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/opennlp/blob/6f80a897/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java index 2cff212..3d68945 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java +++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java @@ -54,6 +54,8 @@ import opennlp.tools.formats.convert.ParseToPOSSampleStreamFactory; import opennlp.tools.formats.convert.ParseToSentenceSampleStreamFactory; import opennlp.tools.formats.convert.ParseToTokenSampleStreamFactory; import 
opennlp.tools.formats.frenchtreebank.ConstitParseSampleStreamFactory; +import opennlp.tools.formats.irishsentencebank.IrishSentenceBankSentenceStreamFactory; +import opennlp.tools.formats.irishsentencebank.IrishSentenceBankTokenSampleStreamFactory; import opennlp.tools.formats.letsmt.LetsmtSentenceStreamFactory; import opennlp.tools.formats.moses.MosesSentenceSampleStreamFactory; import opennlp.tools.formats.muc.Muc6NameSampleStreamFactory; @@ -119,6 +121,9 @@ public final class StreamFactoryRegistry { ConlluSentenceSampleStreamFactory.registerFactory(); ConlluPOSSampleStreamFactory.registerFactory(); ConlluLemmaSampleStreamFactory.registerFactory(); + + IrishSentenceBankSentenceStreamFactory.registerFactory(); + IrishSentenceBankTokenSampleStreamFactory.registerFactory(); } public static final String DEFAULT_FORMAT = "opennlp"; http://git-wip-us.apache.org/repos/asf/opennlp/blob/6f80a897/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankDocument.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankDocument.java b/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankDocument.java new file mode 100644 index 0000000..91ab650 --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankDocument.java @@ -0,0 +1,271 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
package opennlp.tools.formats.irishsentencebank;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.lang.StringBuilder;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import javax.xml.XMLConstants;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;

import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;

import opennlp.tools.tokenize.TokenSample;
import opennlp.tools.util.Span;

/**
 * A structure to hold an Irish Sentence Bank document, which is a collection
 * of tokenized sentences.
 * <p>
 * The sentence bank can be downloaded from, and is described
 * <a href="http://www.lexiconista.com/datasets/sentencebank-ga/">here</a>
 */
public class IrishSentenceBankDocument {

  /**
   * Pairs the surface form of a token with the lemmas listed for the same slot.
   */
  public static class IrishSentenceBankFlex {
    String surface;
    String[] flex;

    /**
     * @param sf the surface form, i.e. the text of the {@code <token>} element
     * @param fl the {@code lemma} values of all {@code <flex>} elements sharing the token's slot
     */
    public IrishSentenceBankFlex(String sf, String[] fl) {
      this.surface = sf;
      this.flex = fl;
    }

    /** @return the surface form of the token */
    public String getSurface() {
      return surface;
    }

    /** @return the lemmas recorded for the token's slot */
    public String[] getFlex() {
      return flex;
    }
  }

  /**
   * One {@code <sentence>} element: its source attribute, translation, the
   * reconstructed original text, the token spans into that text and the
   * per-slot flex entries.
   */
  public static class IrishSentenceBankSentence {
    private final String source;
    private final String translation;
    private final String original;
    private final Span[] tokens;
    private final IrishSentenceBankFlex[] flex;

    /**
     * @param src   value of the {@code source} attribute
     * @param trans text of the {@code <translation>} element
     * @param orig  text of the {@code <original>} element
     * @param toks  token spans, as offsets into {@code orig}
     * @param flx   flex entries indexed by {@code slot - 1}; entries for slots
     *              without a token are {@code null}
     */
    public IrishSentenceBankSentence(String src, String trans, String orig,
                                     Span[] toks, IrishSentenceBankFlex[] flx) {
      this.source = src;
      this.translation = trans;
      this.original = orig;
      this.tokens = toks;
      this.flex = flx;
    }

    public String getSource() {
      return source;
    }

    public String getTranslation() {
      return translation;
    }

    public String getOriginal() {
      return original;
    }

    public Span[] getTokens() {
      return tokens;
    }

    public IrishSentenceBankFlex[] getFlex() {
      return flex;
    }

    /** @return a {@link TokenSample} built from the original text and its token spans */
    public TokenSample getTokenSample() {
      return new TokenSample(original, tokens);
    }
  }

  private final List<IrishSentenceBankSentence> sentences = new ArrayList<>();

  public IrishSentenceBankDocument() {
  }

  /**
   * Appends a sentence to this document.
   *
   * @param sent the sentence to add
   */
  public void add(IrishSentenceBankSentence sent) {
    this.sentences.add(sent);
  }

  /** @return a read-only view of the sentences in document order */
  public List<IrishSentenceBankSentence> getSentences() {
    return Collections.unmodifiableList(sentences);
  }

  /**
   * Helper to adjust the span of punctuation tokens: ignores spaces to the left of the string
   * @param s the string to check
   * @param start the offset of the start of the string
   * @return the offset adjusted to ignore spaces to the left
   */
  private static int advanceLeft(String s, int start) {
    int ret = start;
    for (int i = 0; i < s.length() && s.charAt(i) == ' '; i++) {
      ret++;
    }
    return ret;
  }

  /**
   * Helper to adjust the span of punctuation tokens: ignores spaces to the right of the string
   * @param s the string to check
   * @param start the offset of the start of the string
   * @return the offset of the end of the string, adjusted to ignore spaces to the right
   */
  private static int advanceRight(String s, int start) {
    int ret = start + s.length();
    // Index 0 must be inspected too; stopping at 1 produced an end offset
    // smaller than the start offset for text consisting only of spaces.
    for (int i = s.length() - 1; i >= 0 && s.charAt(i) == ' '; i--) {
      ret--;
    }
    return ret;
  }

  /**
   * Creates a DOM parser with secure processing enabled and external
   * entities disabled, so that parsing a sentence bank file cannot pull in
   * external resources.
   */
  private static DocumentBuilder newDocumentBuilder() throws ParserConfigurationException {
    DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
    factory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true);
    factory.setFeature("http://xml.org/sax/features/external-general-entities", false);
    factory.setFeature("http://xml.org/sax/features/external-parameter-entities", false);
    return factory.newDocumentBuilder();
  }

  /** Returns the value of a mandatory attribute or fails with an {@link IOException}. */
  private static String requiredAttribute(Node node, String attribute) throws IOException {
    Node value = node.getAttributes() == null ? null : node.getAttributes().getNamedItem(attribute);
    if (value == null) {
      throw new IOException("Missing attribute '" + attribute + "' on node: " + node.getNodeName());
    }
    return value.getNodeValue();
  }

  /** Reads the 1-based {@code slot} attribute of a token or flex node. */
  private static int slotOf(Node node) throws IOException {
    String raw = requiredAttribute(node, "slot");
    int slot;
    try {
      slot = Integer.parseInt(raw);
    } catch (NumberFormatException e) {
      throw new IOException("Invalid slot '" + raw + "' on node: " + node.getNodeName(), e);
    }
    if (slot < 1) {
      // slots index the flex array as slot - 1
      throw new IOException("Slot must be positive but was " + slot
          + " on node: " + node.getNodeName());
    }
    return slot;
  }

  /** Returns the text of the node's first child, or the empty string if it has no children. */
  private static String firstChildText(Node node) {
    Node first = node.getFirstChild();
    return first == null ? "" : first.getTextContent();
  }

  /**
   * Walks the children of an {@code <original>} element, appending their text
   * to {@code orig} and recording one span per token and per non-blank run of
   * inter-token text (punctuation).
   *
   * @return the highest token slot seen, or 0 if there was no token
   */
  private static int parseOriginal(Node original, StringBuilder orig, List<Span> spans,
                                   Map<Integer, String> tokensBySlot) throws IOException {
    int maxSlot = 0;
    NodeList parts = original.getChildNodes();
    for (int k = 0; k < parts.getLength(); k++) {
      Node part = parts.item(k);
      // The running offset is always the length of the text collected so far.
      int offset = orig.length();
      switch (part.getNodeName()) {
        case "token": {
          String token = firstChildText(part);
          if (token.isEmpty()) {
            throw new IOException("Empty token at offset " + offset);
          }
          int slot = slotOf(part);
          maxSlot = Math.max(maxSlot, slot);
          spans.add(new Span(offset, offset + token.length()));
          tokensBySlot.put(slot, token);
          orig.append(token);
          break;
        }
        case "#text": {
          String text = part.getTextContent();
          orig.append(text);
          int begin = advanceLeft(text, offset);
          int end = advanceRight(text, offset);
          // Text made up only of spaces separates tokens and is not a token itself.
          if (begin < end) {
            spans.add(new Span(begin, end));
          }
          break;
        }
        default:
          throw new IOException("Unexpected node: " + part.getNodeName());
      }
    }
    return maxSlot;
  }

  /** Converts one {@code <sentence>} element into an {@link IrishSentenceBankSentence}. */
  private static IrishSentenceBankSentence parseSentence(Node sentenceNode) throws IOException {
    String src = requiredAttribute(sentenceNode, "source");
    String trans = "";
    Map<Integer, String> tokensBySlot = new HashMap<>();
    Map<Integer, List<String>> lemmasBySlot = new HashMap<>();
    List<Span> spans = new ArrayList<>();
    StringBuilder orig = new StringBuilder();
    // The flex array always has at least one entry, as before.
    int maxSlot = 1;

    NodeList children = sentenceNode.getChildNodes();
    for (int j = 0; j < children.getLength(); j++) {
      Node child = children.item(j);
      String name = child.getNodeName();
      switch (name) {
        case "flex": {
          int slot = slotOf(child);
          maxSlot = Math.max(maxSlot, slot);
          lemmasBySlot.computeIfAbsent(slot, k -> new ArrayList<>())
              .add(requiredAttribute(child, "lemma"));
          break;
        }
        case "translation":
          trans = firstChildText(child);
          break;

        case "original":
          maxSlot = Math.max(maxSlot, parseOriginal(child, orig, spans, tokensBySlot));
          break;

        case "#text":
        case "#comment":
          break;

        default:
          throw new IOException("Unexpected node: " + name);
      }
    }

    IrishSentenceBankFlex[] flexa = new IrishSentenceBankFlex[maxSlot];
    for (Map.Entry<Integer, String> token : tokensBySlot.entrySet()) {
      // A token without any <flex> element gets an empty lemma array instead of an NPE.
      List<String> lemmas =
          lemmasBySlot.getOrDefault(token.getKey(), Collections.<String>emptyList());
      flexa[token.getKey() - 1] =
          new IrishSentenceBankFlex(token.getValue(), lemmas.toArray(new String[0]));
    }

    return new IrishSentenceBankSentence(src, trans, orig.toString(),
        spans.toArray(new Span[0]), flexa);
  }

  /**
   * Parses an Irish Sentence Bank XML document.
   *
   * @param is the stream to read the XML from; it is not closed by this method
   * @return the parsed document
   * @throws IOException if the XML is malformed, the root element is not
   *     {@code sentences}, or an unexpected or incomplete element is found
   */
  public static IrishSentenceBankDocument parse(InputStream is) throws IOException {
    IrishSentenceBankDocument document = new IrishSentenceBankDocument();

    try {
      Document doc = newDocumentBuilder().parse(is);

      Node rootNode = doc.getDocumentElement();
      String root = rootNode.getNodeName();
      if (!root.equalsIgnoreCase("sentences")) {
        throw new IOException("Expected root node 'sentences', but found '" + root + "'");
      }

      NodeList nl = rootNode.getChildNodes();
      for (int i = 0; i < nl.getLength(); i++) {
        Node sentnode = nl.item(i);
        String name = sentnode.getNodeName();
        if ("sentence".equals(name)) {
          document.add(parseSentence(sentnode));
        } else if (!"#text".equals(name) && !"#comment".equals(name)) {
          throw new IOException("Unexpected node: " + name);
        }
      }
      return document;
    } catch (ParserConfigurationException e) {
      throw new IllegalStateException(e);
    } catch (SAXException e) {
      throw new IOException("Failed to parse IrishSentenceBank document", e);
    }
  }

  /**
   * Parses an Irish Sentence Bank XML file.
   *
   * @param file the file to read
   * @return the parsed document
   * @throws IOException if the file cannot be read or parsed
   */
  static IrishSentenceBankDocument parse(File file) throws IOException {
    try (InputStream in = new FileInputStream(file)) {
      return parse(in);
    }
  }
}
package opennlp.tools.formats.irishsentencebank;

import java.io.IOException;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;

import opennlp.tools.sentdetect.SentenceSample;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.Span;

/**
 * Exposes an {@link IrishSentenceBankDocument} as a stream of
 * {@link SentenceSample}s. The first call to {@link #read()} after
 * construction or {@link #reset()} consumes every remaining sentence and
 * returns them as one sample; later calls return {@code null}.
 */
class IrishSentenceBankSentenceStream implements ObjectStream<SentenceSample> {

  private final IrishSentenceBankDocument source;

  private Iterator<IrishSentenceBankDocument.IrishSentenceBankSentence> sentenceIt;

  IrishSentenceBankSentenceStream(IrishSentenceBankDocument source) {
    this.source = source;
    reset();
  }

  /**
   * Joins all remaining sentences, each followed by a single space, and marks
   * every sentence with a span into the joined text.
   *
   * @return the combined sample, or {@code null} when no sentence is left
   */
  @Override
  public SentenceSample read() throws IOException {

    StringBuilder text = new StringBuilder();
    List<Span> spans = new LinkedList<>();

    while (sentenceIt.hasNext()) {
      String original = sentenceIt.next().getOriginal();
      int begin = text.length();

      // a sentence without original text still contributes an (empty) span
      if (original != null) {
        text.append(original);
      }

      spans.add(new Span(begin, text.length()));
      text.append(' ');
    }

    // nothing was collected: the end of the stream has been reached
    if (spans.isEmpty()) {
      return null;
    }

    Span[] spanArray = spans.toArray(new Span[spans.size()]);
    return new SentenceSample(text.toString(), spanArray);
  }

  /** Restarts iteration from the first sentence of the document. */
  @Override
  public void reset() {
    sentenceIt = source.getSentences().iterator();
  }
}
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package opennlp.tools.formats.irishsentencebank;

import java.io.IOException;

import opennlp.tools.cmdline.ArgumentParser;
import opennlp.tools.cmdline.CmdLineUtil;
import opennlp.tools.cmdline.StreamFactoryRegistry;
import opennlp.tools.cmdline.params.BasicFormatParams;
import opennlp.tools.formats.AbstractSampleStreamFactory;
import opennlp.tools.sentdetect.SentenceSample;
import opennlp.tools.util.ObjectStream;

/**
 * Command line factory that creates {@link SentenceSample} streams from an
 * Irish Sentence Bank file, registered under the format name
 * {@code irishsentencebank}.
 */
public class IrishSentenceBankSentenceStreamFactory extends AbstractSampleStreamFactory<SentenceSample> {

  interface Parameters extends BasicFormatParams {
  }

  protected <P> IrishSentenceBankSentenceStreamFactory(Class<P> params) {
    super(params);
  }

  /** Registers this factory for {@link SentenceSample} with the {@link StreamFactoryRegistry}. */
  public static void registerFactory() {
    IrishSentenceBankSentenceStreamFactory factory =
        new IrishSentenceBankSentenceStreamFactory(
            IrishSentenceBankSentenceStreamFactory.Parameters.class);
    StreamFactoryRegistry.registerFactory(SentenceSample.class, "irishsentencebank", factory);
  }

  /**
   * Parses the command line arguments, loads the sentence bank file they
   * name and wraps it in a sentence stream.
   *
   * @param args the format-specific command line arguments
   * @return a stream over the parsed document
   */
  @Override
  public ObjectStream<SentenceSample> create(String[] args) {

    Parameters parsedArgs = ArgumentParser.parse(args, Parameters.class);

    CmdLineUtil.checkInputFile("Data", parsedArgs.getData());

    IrishSentenceBankDocument document = null;
    try {
      document = IrishSentenceBankDocument.parse(parsedArgs.getData());
    } catch (IOException e) {
      // NOTE(review): presumably this aborts by throwing; otherwise the
      // stream below would be created with a null document - confirm.
      CmdLineUtil.handleCreateObjectStreamError(e);
    }

    return new IrishSentenceBankSentenceStream(document);
  }
}
package opennlp.tools.formats.irishsentencebank;

import java.io.IOException;
import java.util.Iterator;

import opennlp.tools.tokenize.TokenSample;
import opennlp.tools.util.ObjectStream;

/**
 * Exposes an {@link IrishSentenceBankDocument} as a stream of
 * {@link TokenSample}s, one sample per sentence.
 */
class IrishSentenceBankTokenSampleStream implements ObjectStream<TokenSample> {

  private final IrishSentenceBankDocument source;

  private Iterator<IrishSentenceBankDocument.IrishSentenceBankSentence> sentenceIt;

  IrishSentenceBankTokenSampleStream(IrishSentenceBankDocument source) {
    this.source = source;
    reset();
  }

  /**
   * @return the token sample of the next sentence, or {@code null} once all
   *     sentences have been returned
   */
  @Override
  public TokenSample read() throws IOException {
    if (!sentenceIt.hasNext()) {
      return null;
    }
    return sentenceIt.next().getTokenSample();
  }

  /** Restarts iteration from the first sentence of the document. */
  @Override
  public void reset() {
    sentenceIt = source.getSentences().iterator();
  }
}
package opennlp.tools.formats.irishsentencebank;

import java.io.IOException;

import opennlp.tools.cmdline.ArgumentParser;
import opennlp.tools.cmdline.CmdLineUtil;
import opennlp.tools.cmdline.StreamFactoryRegistry;
import opennlp.tools.cmdline.params.BasicFormatParams;
import opennlp.tools.formats.DetokenizerSampleStreamFactory;
import opennlp.tools.tokenize.TokenSample;
import opennlp.tools.util.ObjectStream;

/**
 * Command line factory that creates {@link TokenSample} streams from an
 * Irish Sentence Bank file, registered under the format name
 * {@code irishsentencebank}.
 */
public class IrishSentenceBankTokenSampleStreamFactory extends DetokenizerSampleStreamFactory<TokenSample> {

  interface Parameters extends BasicFormatParams {
  }

  /** Registers this factory for {@link TokenSample} with the {@link StreamFactoryRegistry}. */
  public static void registerFactory() {
    StreamFactoryRegistry.registerFactory(TokenSample.class,
        "irishsentencebank", new IrishSentenceBankTokenSampleStreamFactory(
        IrishSentenceBankTokenSampleStreamFactory.Parameters.class));
  }

  protected <P> IrishSentenceBankTokenSampleStreamFactory(Class<P> params) {
    super(params);
  }

  /**
   * Parses the command line arguments, loads the sentence bank file they
   * name and wraps it in a token sample stream.
   *
   * @param args the format-specific command line arguments
   * @return a stream over the parsed document
   */
  @Override
  public ObjectStream<TokenSample> create(String[] args) {

    Parameters params = ArgumentParser.parse(args, Parameters.class);

    CmdLineUtil.checkInputFile("Data", params.getData());

    IrishSentenceBankDocument isbDoc = null;
    try {
      isbDoc = IrishSentenceBankDocument.parse(params.getData());
    } catch (IOException ex) {
      // NOTE(review): presumably this aborts by throwing; otherwise the
      // stream below would be created with a null document - confirm.
      CmdLineUtil.handleCreateObjectStreamError(ex);
    }

    return new IrishSentenceBankTokenSampleStream(isbDoc);
  }
}
---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankDocumentTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankDocumentTest.java new file mode 100644 index 0000000..671fea0 --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankDocumentTest.java @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
package opennlp.tools.formats.irishsentencebank;

import java.io.IOException;
import java.io.InputStream;
import java.util.List;

import org.junit.Assert;
import org.junit.Test;

import opennlp.tools.tokenize.TokenSample;
import opennlp.tools.util.Span;

/**
 * Checks that the bundled two-sentence sample file is parsed into the
 * expected text, flex entries, translation and token spans.
 */
public class IrishSentenceBankDocumentTest {

  @Test
  public void testParsingSimpleDoc() throws IOException {
    try (InputStream sampleIn =
        IrishSentenceBankDocumentTest.class.getResourceAsStream("irishsentencebank-sample.xml")) {

      IrishSentenceBankDocument parsed = IrishSentenceBankDocument.parse(sampleIn);

      List<IrishSentenceBankDocument.IrishSentenceBankSentence> sentences = parsed.getSentences();
      Assert.assertEquals(2, sentences.size());

      IrishSentenceBankDocument.IrishSentenceBankSentence first = sentences.get(0);
      IrishSentenceBankDocument.IrishSentenceBankSentence second = sentences.get(1);

      // first sentence: reconstructed text and one flex entry per slot
      Assert.assertEquals("A Dhia, tá mé ag iompar clainne!", first.getOriginal());

      IrishSentenceBankDocument.IrishSentenceBankFlex[] firstFlex = first.getFlex();
      Assert.assertEquals(7, firstFlex.length);
      Assert.assertEquals("A", firstFlex[0].getSurface());
      Assert.assertArrayEquals(new String[]{"a"}, firstFlex[0].getFlex());

      // second sentence: slot 5 carries two lemmas
      IrishSentenceBankDocument.IrishSentenceBankFlex[] secondFlex = second.getFlex();
      Assert.assertEquals("ón", secondFlex[4].getSurface());
      Assert.assertArrayEquals(new String[]{"ó", "an"}, secondFlex[4].getFlex());

      Assert.assertEquals("Excuse me, are you from the stone age?", second.getTranslation());

      // seven word tokens plus the comma and the exclamation mark
      TokenSample sample = first.getTokenSample();
      Span[] tokenSpans = sample.getTokenSpans();
      Assert.assertEquals(9, tokenSpans.length);

      Span clainne = tokenSpans[7];
      Assert.assertEquals(24, clainne.getStart());
      Assert.assertEquals(31, clainne.getEnd());
      Assert.assertEquals("clainne",
          sample.getText().substring(clainne.getStart(), clainne.getEnd()));
    }
  }
}
---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/resources/opennlp/tools/formats/irishsentencebank/irishsentencebank-sample.xml b/opennlp-tools/src/test/resources/opennlp/tools/formats/irishsentencebank/irishsentencebank-sample.xml new file mode 100644 index 0000000..91e84c1 --- /dev/null +++ b/opennlp-tools/src/test/resources/opennlp/tools/formats/irishsentencebank/irishsentencebank-sample.xml @@ -0,0 +1,25 @@ +<sentences datestamp="2015-03-10"> +<sentence source='potaL'> + <original xml:space="preserve"><token slot='1'>A</token> <token slot='2'>Dhia</token>, <token slot='3'>tá</token> <token slot='4'>mé</token> <token slot='5'>ag</token> <token slot='6'>iompar</token> <token slot='7'>clainne</token>!</original> + <translation>Oh my God, I'm pregnant!</translation> + <flex slot='1' lemma='a'/> + <flex slot='2' lemma='dia'/> + <flex slot='3' lemma='bí'/> + <flex slot='4' lemma='mé'/> + <flex slot='5' lemma='ag'/> + <flex slot='6' lemma='iompair'/> + <flex slot='7' lemma='clann'/> +</sentence> +<sentence source='potaL'> + <original xml:space="preserve"><token slot='1'>Gabh</token> <token slot='2'>mo</token> <token slot='3'>leithscéal</token>, <token slot='4'>an</token> <token slot='5'>ón</token> <token slot='6'>chlochaois</token> <token slot='7'>thú</token>?</original> + <translation>Excuse me, are you from the stone age?</translation> + <flex slot='1' lemma='gabh'/> + <flex slot='2' lemma='mo'/> + <flex slot='3' lemma='leithscéal'/> + <flex slot='4' lemma='an'/> + <flex slot='5' lemma='ó'/> + <flex slot='5' lemma='an'/> + <flex slot='6' lemma='clochaois'/> + <flex slot='7' lemma='thú'/> +</sentence> +</sentences>
