OPENNLP-1079 Added BratDocumentParser. Closed Annotation stream in BratDocument
Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/e9728694 Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/e9728694 Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/e9728694 Branch: refs/heads/LangDetect Commit: e972869486f85c3424875a443eb04bda2eeb6bd3 Parents: 1aa5432 Author: Daniel Russ <[email protected]> Authored: Thu May 25 14:57:27 2017 -0400 Committer: Daniel Russ <[email protected]> Committed: Thu May 25 14:59:45 2017 -0400 ---------------------------------------------------------------------- .../tools/formats/brat/BratDocument.java | 1 + .../tools/formats/brat/BratDocumentParser.java | 149 +++++++++++++++++++ .../formats/brat/BratNameSampleStream.java | 120 +-------------- 3 files changed, 154 insertions(+), 116 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/opennlp/blob/e9728694/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocument.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocument.java b/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocument.java index 1b9aee2..51723be 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocument.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocument.java @@ -91,6 +91,7 @@ public class BratDocument { while ((ann = annStream.read()) != null) { annotations.add(ann); } + annStream.close(); return new BratDocument(config, id, text.toString(), annotations); } http://git-wip-us.apache.org/repos/asf/opennlp/blob/e9728694/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocumentParser.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocumentParser.java b/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocumentParser.java new file mode 100644 index 0000000..24ba887 --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocumentParser.java @@ -0,0 +1,149 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.formats.brat; + + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import opennlp.tools.namefind.NameSample; +import opennlp.tools.sentdetect.SentenceDetector; +import opennlp.tools.tokenize.Tokenizer; +import opennlp.tools.util.Span; + +public class BratDocumentParser { + + private SentenceDetector sentDetector; + private Tokenizer tokenizer; + + public BratDocumentParser(SentenceDetector sentenceDetector, Tokenizer tokenizer) { + this.sentDetector = sentenceDetector; + this.tokenizer = tokenizer; + } + + public List<NameSample> parse(BratDocument sample) { + // Note: Some entities might not match sentence boundaries, + // to be able to print warning a set of entities id must be maintained + // to check if all entities have been used up after the matching is done + + Set<String> entityIdSet = new HashSet<>(); + Map<Integer, Span> coveredIndexes = new HashMap<>(); + + for (BratAnnotation ann : sample.getAnnotations()) { + if (ann instanceof SpanAnnotation) { + entityIdSet.add(ann.getId()); + + Span span = ((SpanAnnotation) ann).getSpan(); + for (int i = span.getStart(); i < span.getEnd(); i++) { + coveredIndexes.put(i, span); + } + } + } + + List<Span> sentences = new ArrayList<>(); + for (Span sentence : sentDetector.sentPosDetect(sample.getText())) { + Span conflictingName = coveredIndexes.get(sentence.getStart()); + + if (sentences.size() > 0 && conflictingName != null && + conflictingName.getStart() < sentence.getStart()) { + Span lastSentence = sentences.remove(sentences.size() - 1); + sentences.add(new Span(lastSentence.getStart(), sentence.getEnd())); + + System.out.println("Correcting sentence segmentation in document " + + sample.getId()); + } + else { + sentences.add(sentence); + } + } + + // TODO: Token breaks should be enforced on name span boundaries + // a) Just split tokens + // b) Implement a custom token split validator which can be injected into the Tokenizer + + // Currently we are missing all + + List<NameSample> samples = new ArrayList<>(sentences.size()); + + for (Span sentence : sentences) { + + String sentenceText = sentence.getCoveredText( + sample.getText()).toString(); + + Span[] tokens = tokenizer.tokenizePos(sentenceText); + + // Note: + // A begin and end token index can be identical, but map to different + // tokens, to distinguish between between the two begin indexes are + // stored with a negative sign, and end indexes are stored with a positive sign + // in the tokenIndexMap. + // The tokenIndexMap maps to the sentence local token index. + + Map<Integer, Integer> tokenIndexMap = new HashMap<>(); + + for (int i = 0; i < tokens.length; i++) { + tokenIndexMap.put(-(sentence.getStart() + tokens[i].getStart()), i); + tokenIndexMap.put(sentence.getStart() + tokens[i].getEnd(), i + 1); + } + + List<Span> names = new ArrayList<>(); + + for (BratAnnotation ann : sample.getAnnotations()) { + + if (ann instanceof SpanAnnotation) { + SpanAnnotation entity = (SpanAnnotation) ann; + + Span entitySpan = entity.getSpan(); + + if (sentence.contains(entitySpan)) { + entityIdSet.remove(ann.getId()); + + entitySpan = entitySpan.trim(sample.getText()); + + Integer nameBeginIndex = tokenIndexMap.get(-entitySpan.getStart()); + Integer nameEndIndex = tokenIndexMap.get(entitySpan.getEnd()); + + if (nameBeginIndex != null && nameEndIndex != null) { + names.add(new Span(nameBeginIndex, nameEndIndex, entity.getType())); + } + else { + System.err.println("Dropped entity " + entity.getId() + " (" + + entitySpan.getCoveredText(sample.getText()) + ") " + " in document " + + sample.getId() + ", it is not matching tokenization!"); + } + } + } + } + + samples.add(new NameSample(sample.getId(), Span.spansToStrings(tokens, sentenceText), + names.toArray(new Span[names.size()]), null, samples.size() == 0)); + } + + for (String id : entityIdSet) { + System.err.println("Dropped entity " + id + " in document " + + sample.getId() + ", is not matching sentence segmentation!"); + } + + return samples; + } +} + http://git-wip-us.apache.org/repos/asf/opennlp/blob/e9728694/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStream.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStream.java index 569f450..cc066ad 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStream.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStream.java @@ -18,12 +18,7 @@ package opennlp.tools.formats.brat; import java.io.IOException; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.HashSet; import java.util.List; -import java.util.Map; -import java.util.Set; import opennlp.tools.namefind.NameSample; import opennlp.tools.sentdetect.SentenceDetector; @@ -33,22 +28,19 @@ import opennlp.tools.tokenize.Tokenizer; import opennlp.tools.tokenize.TokenizerME; import opennlp.tools.tokenize.TokenizerModel; import opennlp.tools.util.ObjectStream; -import opennlp.tools.util.Span; /** * Generates Name Sample objects for a Brat Document object. */ public class BratNameSampleStream extends SegmenterObjectStream<BratDocument, NameSample> { - private SentenceDetector sentDetector; - private Tokenizer tokenizer; + private final BratDocumentParser parser; public BratNameSampleStream(SentenceDetector sentDetector, Tokenizer tokenizer, ObjectStream<BratDocument> samples) { super(samples); - this.sentDetector = sentDetector; - this.tokenizer = tokenizer; + this.parser = new BratDocumentParser(sentDetector, tokenizer); } public BratNameSampleStream(SentenceModel sentModel, TokenizerModel tokenModel, @@ -56,115 +48,11 @@ public class BratNameSampleStream extends SegmenterObjectStream<BratDocument, Na super(samples); // TODO: We can pass in custom validators here ... - this.sentDetector = new SentenceDetectorME(sentModel); - this.tokenizer = new TokenizerME(tokenModel); + this.parser = new BratDocumentParser(new SentenceDetectorME(sentModel), new TokenizerME(tokenModel)); } @Override protected List<NameSample> read(BratDocument sample) throws IOException { - - // Note: Some entities might not match sentence boundaries, - // to be able to print warning a set of entities id must be maintained - // to check if all entities have been used up after the matching is done - - Set<String> entityIdSet = new HashSet<>(); - Map<Integer, Span> coveredIndexes = new HashMap<>(); - - for (BratAnnotation ann : sample.getAnnotations()) { - if (ann instanceof SpanAnnotation) { - entityIdSet.add(ann.getId()); - - Span span = ((SpanAnnotation) ann).getSpan(); - for (int i = span.getStart(); i < span.getEnd(); i++) { - coveredIndexes.put(i, span); - } - } - } - - List<Span> sentences = new ArrayList<>(); - for (Span sentence : sentDetector.sentPosDetect(sample.getText())) { - Span conflictingName = coveredIndexes.get(sentence.getStart()); - - if (sentences.size() > 0 && conflictingName != null && - conflictingName.getStart() < sentence.getStart()) { - Span lastSentence = sentences.remove(sentences.size() - 1); - sentences.add(new Span(lastSentence.getStart(), sentence.getEnd())); - - System.out.println("Correcting sentence segmentation in document " + - sample.getId()); - } - else { - sentences.add(sentence); - } - } - - // TODO: Token breaks should be enforced on name span boundaries - // a) Just split tokens - // b) Implement a custom token split validator which can be injected into the Tokenizer - - // Currently we are missing all - - List<NameSample> samples = new ArrayList<>(sentences.size()); - - for (Span sentence : sentences) { - - String sentenceText = sentence.getCoveredText( - sample.getText()).toString(); - - Span[] tokens = tokenizer.tokenizePos(sentenceText); - - // Note: - // A begin and end token index can be identical, but map to different - // tokens, to distinguish between between the two begin indexes are - // stored with a negative sign, and end indexes are stored with a positive sign - // in the tokenIndexMap. - // The tokenIndexMap maps to the sentence local token index. - - Map<Integer, Integer> tokenIndexMap = new HashMap<>(); - - for (int i = 0; i < tokens.length; i++) { - tokenIndexMap.put(-(sentence.getStart() + tokens[i].getStart()), i); - tokenIndexMap.put(sentence.getStart() + tokens[i].getEnd(), i + 1); - } - - List<Span> names = new ArrayList<>(); - - for (BratAnnotation ann : sample.getAnnotations()) { - - if (ann instanceof SpanAnnotation) { - SpanAnnotation entity = (SpanAnnotation) ann; - - Span entitySpan = entity.getSpan(); - - if (sentence.contains(entitySpan)) { - entityIdSet.remove(ann.getId()); - - entitySpan = entitySpan.trim(sample.getText()); - - Integer nameBeginIndex = tokenIndexMap.get(-entitySpan.getStart()); - Integer nameEndIndex = tokenIndexMap.get(entitySpan.getEnd()); - - if (nameBeginIndex != null && nameEndIndex != null) { - names.add(new Span(nameBeginIndex, nameEndIndex, entity.getType())); - } - else { - System.err.println("Dropped entity " + entity.getId() + " (" - + entitySpan.getCoveredText(sample.getText()) + ") " + " in document " - + sample.getId() + ", it is not matching tokenization!"); - } - } - } - } - - samples.add(new NameSample(sample.getId(), Span.spansToStrings(tokens, sentenceText), - names.toArray(new Span[names.size()]), null, samples.size() == 0)); - } - - for (String id : entityIdSet) { - System.err.println("Dropped entity " + id + " in document " + - sample.getId() + ", is not matching sentence segmentation!"); - } - - return samples; + return parser.parse(sample); } }
