Author: rwesten
Date: Wed Oct 22 06:37:36 2014
New Revision: 1633539
URL: http://svn.apache.org/r1633539
Log:
implementation for STANBOL-1396
Modified:
stanbol/branches/release-0.12/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/Section.java
stanbol/branches/release-0.12/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/impl/SectionImpl.java
stanbol/branches/release-0.12/enhancer/generic/nlp/src/test/java/org/apache/stanbol/enhancer/nlp/model/AnalysedTextTest.java
Modified:
stanbol/branches/release-0.12/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/Section.java
URL:
http://svn.apache.org/viewvc/stanbol/branches/release-0.12/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/Section.java?rev=1633539&r1=1633538&r2=1633539&view=diff
==============================================================================
---
stanbol/branches/release-0.12/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/Section.java
(original)
+++
stanbol/branches/release-0.12/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/Section.java
Wed Oct 22 06:37:36 2014
@@ -39,6 +39,24 @@ public interface Section extends Span {
Iterator<Span> getEnclosed(Set<SpanTypeEnum> types);
/**
+ * Iterates over all enclosed Spans within the passed window. Only Spans
+ * with one of the passed types are returned.
+ * <p>
+ * The passed window (start/end offsets) is relative to the section. If
+ * the passed window exceeds the Section, the window is clipped to the section.
+ * This means that this method will never return Spans outside the section.
+ * <p>
+ * Returned Iterators MUST NOT throw {@link ConcurrentModificationException}
+ * but MUST instead take Spans added during the iteration into account.
+ * @param types the {@link SpanTypeEnum types} of Spans included
+ * @param startOffset the start offset relative to the start position of this {@link Section}
+ * @param endOffset the end offset relative to the start position of this {@link Section}.
+ * @return sorted iterator over the selected Spans.
+ * @since 0.12.1
+ */
+ Iterator<Span> getEnclosed(Set<SpanTypeEnum> types, int startOffset, int
endOffset);
+
+ /**
* Adds an Token relative to this Sentence
* @param start the start of the token relative to the sentence
* @param end
Modified:
stanbol/branches/release-0.12/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/impl/SectionImpl.java
URL:
http://svn.apache.org/viewvc/stanbol/branches/release-0.12/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/impl/SectionImpl.java?rev=1633539&r1=1633538&r2=1633539&view=diff
==============================================================================
---
stanbol/branches/release-0.12/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/impl/SectionImpl.java
(original)
+++
stanbol/branches/release-0.12/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/impl/SectionImpl.java
Wed Oct 22 06:37:36 2014
@@ -16,6 +16,7 @@
*/
package org.apache.stanbol.enhancer.nlp.model.impl;
+import java.util.Collections;
import java.util.ConcurrentModificationException;
import java.util.Iterator;
import java.util.NavigableMap;
@@ -74,6 +75,28 @@ public abstract class SectionImpl extend
}
});
}
+
+ @Override
+ @SuppressWarnings("unchecked")
+ public Iterator<Span> getEnclosed(final Set<SpanTypeEnum> types, int
startOffset, int endOffset) {
+ if(startOffset >= (span[1] - span[0])){ //start is outside the span
+ return Collections.<Span>emptySet().iterator();
+ }
+ int startIdx = startOffset < 0 ? span[0] : (span[0]+ startOffset);
+ int endIdx = span[0] + endOffset;
+ if(endIdx <= startIdx) {
+ return Collections.<Span>emptySet().iterator();
+ } else if(endIdx > span[1]){
+ endIdx = span[1];
+ }
+ return IteratorUtils.filteredIterator(getIterator(new
SubSetHelperSpan(startIdx, endIdx)),
+ new Predicate() {
+ @Override
+ public boolean evaluate(Object span) {
+ return types.contains(((Span)span).getType());
+ }
+ });
+ }
/**
* Iterator that does not throw {@link ConcurrentModificationException} but
* considers modifications to the underlying set by using the
@@ -85,13 +108,32 @@ public abstract class SectionImpl extend
* @return the iterator
*/
protected Iterator<Span> getIterator(){
- //the end of this section
- final Span end = new SubSetHelperSpan(getEnd());
+ return getIterator(null);
+ }
+ /**
+ * Iterator that does not throw {@link ConcurrentModificationException} but
+ * considers modifications to the underlying set by using the
+ * {@link NavigableMap#higherKey(Object)} method for iterating over the
+ * Elements!<p>
+ * This allows adding new {@link Span}s to the {@link Section} while
+ * iterating (e.g. adding {@link Token}s and/or {@link Chunk}s while iterating
+ * over the {@link Sentence}s of an {@link AnalysedText})
+ * @param section the (sub-)section of the current section to iterate or
+ * <code>null</code> to iterate the whole section.
+ * @return the iterator
+ */
+ protected Iterator<Span> getIterator(final SubSetHelperSpan section){
+ //create a virtual Span with the end of the section to iterate over
+ final Span end = new SubSetHelperSpan(
+ section == null ? getEnd() : //if no section is defined use the
parent
+ section.getEnd()); //use the end of the desired section
return new Iterator<Span>() {
boolean init = false;
boolean removed = true;
- private Span span = SectionImpl.this;
+ //init with the first span of the iterator
+ private Span span = section == null ?
+ SectionImpl.this : section;
@Override
public boolean hasNext() {
Modified:
stanbol/branches/release-0.12/enhancer/generic/nlp/src/test/java/org/apache/stanbol/enhancer/nlp/model/AnalysedTextTest.java
URL:
http://svn.apache.org/viewvc/stanbol/branches/release-0.12/enhancer/generic/nlp/src/test/java/org/apache/stanbol/enhancer/nlp/model/AnalysedTextTest.java?rev=1633539&r1=1633538&r2=1633539&view=diff
==============================================================================
---
stanbol/branches/release-0.12/enhancer/generic/nlp/src/test/java/org/apache/stanbol/enhancer/nlp/model/AnalysedTextTest.java
(original)
+++
stanbol/branches/release-0.12/enhancer/generic/nlp/src/test/java/org/apache/stanbol/enhancer/nlp/model/AnalysedTextTest.java
Wed Oct 22 06:37:36 2014
@@ -19,21 +19,26 @@ package org.apache.stanbol.enhancer.nlp.
import java.io.IOException;
import java.util.ArrayList;
+import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.EnumSet;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
+import java.util.LinkedList;
import java.util.List;
import java.util.Map.Entry;
import java.util.Set;
import org.apache.clerezza.rdf.core.UriRef;
+import org.apache.commons.collections.CollectionUtils;
import
org.apache.stanbol.enhancer.contentitem.inmemory.InMemoryContentItemFactory;
import org.apache.stanbol.enhancer.nlp.model.Span.SpanTypeEnum;
import org.apache.stanbol.enhancer.nlp.model.annotation.Annotation;
import org.apache.stanbol.enhancer.nlp.model.annotation.Value;
+import org.apache.stanbol.enhancer.nlp.utils.NIFHelper;
+import org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper;
import org.apache.stanbol.enhancer.servicesapi.Blob;
import org.apache.stanbol.enhancer.servicesapi.ContentItem;
import org.apache.stanbol.enhancer.servicesapi.ContentItemFactory;
@@ -383,6 +388,65 @@ public class AnalysedTextTest {
(sentences.size()*chunksInSentence),c);
}
+ /**
+ * Tests the {@link Section#getEnclosed(Set, int, int)} method introduced
+ * with <code>0.12.1</code>
+ */
+ @Test
+ public void testSubSectionIteration(){
+ log.info("testSubSectionIteration ...");
+ List<Span> expectedSpans = new ArrayList<Span>();
+ List<Sentence> sentences = new ArrayList<Sentence>();
+ Set<SpanTypeEnum> enabledTypes = EnumSet.of(SpanTypeEnum.Sentence,
SpanTypeEnum.Token);
+ Iterator<Span> allIt = analysedTextWithData.getEnclosed(enabledTypes);
+ while(allIt.hasNext()){
+ Span s = allIt.next();
+ expectedSpans.add(s);
+ if(s.getType() == SpanTypeEnum.Sentence) {
+ sentences.add((Sentence)s);
+ }
+ }
+ //first test an section that exceeds the end of the text
+ int[] testSpan = new int[]{4,90};
+ Assert.assertEquals(5, assertSectionIterator(analysedTextWithData,
+ expectedSpans, testSpan, enabledTypes));
+ //second test a section relative to an sentence
+ Sentence lastSent = sentences.get(sentences.size()-1);
+ int [] offsetSpan = new int[]{5,25};
+ Assert.assertEquals(1, assertSectionIterator(lastSent, expectedSpans,
+ offsetSpan, enabledTypes));
+
+ }
+
+
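+ /**
+ * Walks the expected Spans and asserts that those inside the tested window
+ * are returned, in the same order, by
+ * {@link Section#getEnclosed(Set, int, int)}.
+ * @param section the Section on which the windowed iterator is requested
+ * @param span the expected Spans, in the order the iterator has to return them
+ * @param testSpan the tested window as <code>{startOffset, endOffset}</code>,
+ * relative to the start of the section
+ * @param types the Span types passed to the iterator
+ * @return the number of expected Spans that were matched against the iterator
+ */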
+ private int assertSectionIterator(Section section, List<Span> span, int[]
testSpan, Set<SpanTypeEnum> types) {
+ log.info("> assert span {} over {}", Arrays.toString(testSpan),
section);
+ Iterator<Span> sectionIt = section.getEnclosed(
+ types, testSpan[0], testSpan[1]);
+ int startIdx = section.getStart() + testSpan[0];
+ int endIdx = section.getStart() + testSpan[1];
+ int count = 0;
+ for(Span s : span){
+ if(s.getStart() < startIdx){
+ log.info(" - asserted {} before section", s);
+ } else if(s.getEnd() < endIdx){
+ Assert.assertTrue(sectionIt.hasNext());
+ Assert.assertEquals(s, sectionIt.next());
+ count++;
+ log.info(" - asserted section token {}", s);
+ } else {
+ log.info(" - asserted correct section end", s);
+ Assert.assertFalse(sectionIt.hasNext());
+ break;
+ }
+ }
+ return count;
+ }
+
@Test
public void testAnnotation(){
List<Value<Number>> values = new ArrayList<Value<Number>>();