Added: incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/utils/NlpEngineHelper.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/utils/NlpEngineHelper.java?rev=1387488&view=auto
==============================================================================
--- incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/utils/NlpEngineHelper.java (added)
+++ incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/utils/NlpEngineHelper.java Wed Sep 19 08:48:32 2012
@@ -0,0 +1,179 @@
+package org.apache.stanbol.enhancer.nlp.utils;
+
+import static java.util.Collections.singleton;
+
+import java.io.IOException;
+import java.util.Map;
+import java.util.Map.Entry;
+
+import org.apache.clerezza.rdf.core.UriRef;
+import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
+import org.apache.stanbol.enhancer.nlp.model.AnalysedTextFactory;
+import org.apache.stanbol.enhancer.nlp.model.AnalysedTextUtils;
+import org.apache.stanbol.enhancer.servicesapi.Blob;
+import org.apache.stanbol.enhancer.servicesapi.ContentItem;
+import org.apache.stanbol.enhancer.servicesapi.EngineException;
+import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
+import org.apache.stanbol.enhancer.servicesapi.helper.ContentItemHelper;
+import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Utility class for {@link EnhancementEngine} implementations that
+ * use the {@link AnalysedText} content part.
+ * @author Rupert Westenthaler
+ *
+ */
+public final class NlpEngineHelper {
+
+    private static final Logger log = LoggerFactory.getLogger(NlpEngineHelper.class);
+
+    private NlpEngineHelper(){}
+
+
+    /**
+     * Getter for the AnalysedText for a ContentItem
+     * @param engine the EnhancementEngine calling this method (used for logging)
+     * @param ci the ContentItem
+     * @param exception <code>false</code> if used in {@link EnhancementEngine#canEnhance(ContentItem)}
+     * and <code>true</code> when called from {@link EnhancementEngine#computeEnhancements(ContentItem)}
+     * @return the AnalysedText or <code>null</code> if not found.
+     * @throws IllegalStateException if exception is <code>true</code> and the
+     * {@link AnalysedText} could not be retrieved from the parsed {@link ContentItem}.
+     */
+    public static AnalysedText getAnalysedText(EnhancementEngine engine, ContentItem ci, boolean exception) {
+        AnalysedText at;
+        try {
+            at = AnalysedTextUtils.getAnalysedText(ci);
+        } catch (RuntimeException e) {
+            log.warn("Unable to retrieve AnalysedText for ContentItem " +
+                    ci + " because of an "+e.getClass().getSimpleName()+" with message " +
+                    e.getMessage(),e);
+            at = null;
+        }
+        if(at != null){
+            return at;
+        }
+        if(exception){
+            throw new IllegalStateException("Unable to retrieve AnalysedText from ContentItem " +
+                    ci + ". As this is also checked in canEnhance this may indicate a Bug in the " +
+                    "used EnhancementJobManager!");
+        } else {
+            log.warn("The Enhancement Engine '{} (impl: {})' CAN NOT enhance " +
+                    "ContentItem {} because the AnalysedText ContentPart is " +
+                    "missing. Users might want to add an EnhancementEngine that " +
+                    "creates the AnalysedText ContentPart such as the " +
+                    "POSTaggingEngine (o.a.stanbol.enhancer.engines.opennlp.pos)!",
+                    new Object[]{engine.getName(), engine.getClass().getSimpleName(),ci});
+            return null;
+        }
+    }
+
+    /**
+     * Getter for the language of the content
+     * @param engine the EnhancementEngine calling this method (used for logging)
+     * @param ci the ContentItem
+     * @param exception <code>false</code> if used in {@link EnhancementEngine#canEnhance(ContentItem)}
+     * and <code>true</code> when called from {@link EnhancementEngine#computeEnhancements(ContentItem)}
+     * @return the language or <code>null</code> if not found.
+     * @throws IllegalStateException if exception is <code>true</code> and the
+     * language could not be retrieved from the parsed {@link ContentItem}.
+     */
+    public static String getLanguage(EnhancementEngine engine, ContentItem ci, boolean exception) {
+        String language = EnhancementEngineHelper.getLanguage(ci);
+        if(language != null) {
+            return language;
+        }
+        if(exception){
+            throw new IllegalStateException("Unable to retrieve the detected language for ContentItem " +
+                    ci + ". As this is also checked in canEnhance this may indicate a Bug in the " +
+                    "used EnhancementJobManager!");
+        } else {
+            log.warn("The Enhancement Engine '{} (impl: {})' CAN NOT enhance " +
+                    "ContentItem {} because the language of " +
+                    "this ContentItem is unknown. Users might want to add a " +
+                    "Language Identification EnhancementEngine to the current " +
+                    "EnhancementChain!",
+                    new Object[]{engine.getName(), engine.getClass().getSimpleName(),ci});
+            return null;
+        }
+    }
+    /**
+     * Retrieves - or if not present - creates the {@link AnalysedText} content
+     * part for the parsed {@link ContentItem}. If no {@link Blob} with the
+     * mime type '<code>text/plain</code>' is present this method
+     * throws an {@link IllegalStateException} (this method internally uses
+     * {@link #getPlainText(EnhancementEngine, ContentItem, boolean)} with
+     * <code>true</code> as third parameter). Users of this method should call
+     * {@link #getPlainText(EnhancementEngine, ContentItem, boolean)} with
+     * <code>false</code> as third parameter in their
+     * {@link EnhancementEngine#canEnhance(ContentItem)} implementation.<p>
+     * <i>NOTE:</i> This method is intended for Engines that want to create an
+     * empty {@link AnalysedText} content part. Engines that assume that this
+     * content part is already present (e.g. if they consume already existing
+     * annotations) should use the
+     * {@link #getAnalysedText(EnhancementEngine, ContentItem, boolean)}
+     * method instead.
+     * @param engine the EnhancementEngine calling this method (used for logging)
+     * @param analysedTextFactory the {@link AnalysedTextFactory} used to create
+     * the {@link AnalysedText} instance (if not present).
+     * @param ci the {@link ContentItem}
+     * @return the AnalysedText
+     * @throws EngineException on any exception while accessing the
+     * '<code>text/plain</code>' Blob
+     * @throws IllegalStateException if no '<code>text/plain</code>' Blob is
+     * present as content part of the parsed {@link ContentItem}. NOTE that if
+     * the {@link AnalysedText} content part is already present no Exception will
+     * be thrown even if no plain text {@link Blob} is present in the parsed
+     * {@link ContentItem}
+     */
+    public static AnalysedText initAnalysedText(EnhancementEngine engine,
+                                                AnalysedTextFactory analysedTextFactory,
+                                                ContentItem ci) throws EngineException {
+        AnalysedText at = AnalysedTextUtils.getAnalysedText(ci);
+        if(at == null){
+            Entry<UriRef,Blob> textBlob = getPlainText(engine, ci, true);
+            log.debug(" ... create new AnalysedText instance for Engine {}", engine.getName());
+            try {
+                at = analysedTextFactory.createAnalysedText(ci, textBlob.getValue());
+            } catch (IOException e) {
+                throw new EngineException("Unable to create AnalysedText instance for Blob " +
+                        textBlob.getKey()+ " of ContentItem "+ci.getUri()+"!",e);
+            }
+        } else {
+            log.debug(" ... use existing AnalysedText instance for Engine {}", engine.getName());
+        }
+        return at;
+    }
+
+    /**
+     * Getter for the '<code>text/plain</code>' {@link Blob} of the parsed ContentItem
+     * @param engine the EnhancementEngine calling this method (used for logging)
+     * @param ci the ContentItem
+     * @param exception <code>false</code> if used in {@link EnhancementEngine#canEnhance(ContentItem)}
+     * and <code>true</code> when called from {@link EnhancementEngine#computeEnhancements(ContentItem)}
+     * @return the '<code>text/plain</code>' Blob (with its {@link UriRef}) or
+     * <code>null</code> if not found.
+     * @throws IllegalStateException if exception is <code>true</code> and no
+     * '<code>text/plain</code>' {@link Blob} could be retrieved from the parsed {@link ContentItem}.
+     */
+    public static Entry<UriRef,Blob> getPlainText(EnhancementEngine engine, ContentItem ci, boolean exception) {
+        Entry<UriRef,Blob> textBlob = ContentItemHelper.getBlob(
+            ci, singleton("text/plain"));
+        if(textBlob != null) {
+            return textBlob;
+        }
+        if(exception){
+            throw new IllegalStateException("Unable to retrieve 'text/plain' ContentPart for ContentItem " +
+                    ci + ". As this is also checked in canEnhance this may indicate a Bug in the " +
+                    "used EnhancementJobManager!");
+        } else {
+            log.warn("The Enhancement Engine '{} (impl: {})' CAN NOT enhance " +
+                    "ContentItem {} because no 'text/plain' ContentPart is " +
+                    "present in this ContentItem. Users that need to enhance " +
+                    "non-plain-text Content need to add an EnhancementEngine " +
+                    "that supports the conversion of '{}' files to plain text " +
+                    "to the current EnhancementChain!",
+                    new Object[]{engine.getName(), engine.getClass().getSimpleName(),ci,ci.getMimeType()});
+            return null;
+        }
+    }
+
+}
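For readers following the branch: the intended calling pattern of the new helper is to pass <code>exception=false</code> from canEnhance(..) (a missing part only means the engine is not applicable) and <code>exception=true</code> from computeEnhancements(..) (a missing part at that point indicates a broken chain). A minimal sketch of that pattern - the engine class and name are hypothetical, only NlpEngineHelper and the EnhancementEngine interface are taken from this commit:

    import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
    import org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper;
    import org.apache.stanbol.enhancer.servicesapi.ContentItem;
    import org.apache.stanbol.enhancer.servicesapi.EngineException;
    import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;

    /** Hypothetical engine illustrating the canEnhance/computeEnhancements pattern. */
    public class ExampleNlpEngine implements EnhancementEngine {

        public String getName() {
            return "example-nlp"; //hypothetical engine name
        }

        public int canEnhance(ContentItem ci) throws EngineException {
            //'false': a missing AnalysedText or language only means this
            //engine is not applicable to the parsed ContentItem
            if(NlpEngineHelper.getAnalysedText(this, ci, false) == null ||
                    NlpEngineHelper.getLanguage(this, ci, false) == null){
                return CANNOT_ENHANCE;
            }
            return ENHANCE_ASYNC;
        }

        public void computeEnhancements(ContentItem ci) throws EngineException {
            //'true': canEnhance already accepted this item, so a missing
            //part now indicates a bug in the EnhancementJobManager
            AnalysedText at = NlpEngineHelper.getAnalysedText(this, ci, true);
            String language = NlpEngineHelper.getLanguage(this, ci, true);
            //... process at.getSentences()/at.getTokens() for 'language' ...
        }
    }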
Added: incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/test/java/org/apache/stanbol/enhancer/nlp/model/AnalysedTextTest.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/test/java/org/apache/stanbol/enhancer/nlp/model/AnalysedTextTest.java?rev=1387488&view=auto
==============================================================================
--- incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/test/java/org/apache/stanbol/enhancer/nlp/model/AnalysedTextTest.java (added)
+++ incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/test/java/org/apache/stanbol/enhancer/nlp/model/AnalysedTextTest.java Wed Sep 19 08:48:32 2012
@@ -0,0 +1,403 @@
+package org.apache.stanbol.enhancer.nlp.model;
+
+import static junit.framework.Assert.assertEquals;
+import static junit.framework.Assert.assertNotNull;
+import static junit.framework.Assert.assertTrue;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.EnumSet;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map.Entry;
+import java.util.Set;
+
+import junit.framework.Assert;
+
+import org.apache.clerezza.rdf.core.UriRef;
+import org.apache.stanbol.enhancer.contentitem.inmemory.InMemoryContentItemFactory;
+import org.apache.stanbol.enhancer.nlp.model.Span.SpanTypeEnum;
+import org.apache.stanbol.enhancer.nlp.model.annotation.Annotation;
+import org.apache.stanbol.enhancer.nlp.model.annotation.Value;
+import org.apache.stanbol.enhancer.servicesapi.Blob;
+import org.apache.stanbol.enhancer.servicesapi.ContentItem;
+import org.apache.stanbol.enhancer.servicesapi.ContentItemFactory;
+import org.apache.stanbol.enhancer.servicesapi.helper.ContentItemHelper;
+import org.apache.stanbol.enhancer.servicesapi.impl.StringSource;
+import org.junit.Before;
+import org.junit.BeforeClass;
+import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Tests the {@link AnalysedText} class added as ContentPart to the ContentItem
+ * @author westei
+ *
+ */
+public class AnalysedTextTest {
+
+    private static Logger log = LoggerFactory.getLogger(AnalysedTextTest.class);
+
+    public static final String text = "The Stanbol enhancer can detect famous " +
+            "cities such as Paris and people such as Bob Marley. With " +
+            "disambiguation it would even be able to detect the Comedian " +
+            "Bob Marley traveling to Paris in Texas.";
+
+    public static final Annotation<String,Number> testAnnotation =
+            new Annotation<String,Number>("test", Number.class);
+
+    /* -----
+     * Test data created within the BeforeClass
+     * -----
+     */
+    /**
+     * AnalysedText instance filled in {@link #setup()} with test data
+     */
+    private static AnalysedText analysedTextWithData;
+    private static LinkedHashMap<Sentence,String> expectedSentences = new LinkedHashMap<Sentence,String>();
+    private static LinkedHashMap<Chunk,String> expectedChunks = new LinkedHashMap<Chunk,String>();
+    private static LinkedHashMap<Token,String> expectedTokens = new LinkedHashMap<Token,String>();
+
+    /* -----
+     * Test data created before every single test
+     * -----
+     */
+    /**
+     * Empty AnalysedText instance created before each test
+     */
+    private static AnalysedText at;
+
+    private static final ContentItemFactory ciFactory = InMemoryContentItemFactory.getInstance();
+    private static final AnalysedTextFactory atFactory = AnalysedTextFactory.getDefaultInstance();
+
+    private static ContentItem ci;
+
+    @BeforeClass
+    public static final void setup() throws IOException {
+        analysedTextWithData = createAnalysedText();
+        int sentence = text.indexOf('.')+1;
+        Sentence sent1 = analysedTextWithData.addSentence(0, sentence);
+        expectedSentences.put(sent1, "The Stanbol enhancer can detect famous " +
+            "cities such as Paris and people such as Bob Marley.");
+
+        Sentence sent2 = analysedTextWithData.addSentence(sentence+1, text.length());
+        expectedSentences.put(sent2, "With disambiguation it would even be able " +
+            "to detect the Comedian Bob Marley traveling to Paris in Texas.");
+
+        Token the = sent1.addToken(0, 3);
+        expectedTokens.put(the, "The");
+        Token stanbol = sent1.addToken(4,11);
+        expectedTokens.put(stanbol, "Stanbol");
+        //use index to create Tokens
+        int enhancerStart = sent1.getSpan().toString().indexOf("enhancer");
+        Token enhancer = sent1.addToken(enhancerStart,enhancerStart+"enhancer".length());
+        expectedTokens.put(enhancer, "enhancer");
+
+        //create a chunk
+        Chunk stanbolEnhancer = analysedTextWithData.addChunk(stanbol.getStart(), enhancer.getEnd());
+        expectedChunks.put(stanbolEnhancer, "Stanbol enhancer");
+
+        int parisStart = sent1.getSpan().toString().indexOf("Paris");
+        Token paris = sent1.addToken(parisStart, parisStart+5);
+        expectedTokens.put(paris, "Paris");
+
+        int bobMarleyStart = sent1.getSpan().toString().indexOf("Bob Marley");
+        Chunk bobMarley = sent1.addChunk(bobMarleyStart, bobMarleyStart+10);
+        expectedChunks.put(bobMarley, "Bob Marley");
+        Token bob = bobMarley.addToken(0, 3);
+        expectedTokens.put(bob, "Bob");
+        Token marley = bobMarley.addToken(4, 10);
+        expectedTokens.put(marley, "Marley");
+
+        Token with = sent2.addToken(0, 4);
+        expectedTokens.put(with, "With");
+        Token disambiguation = sent2.addToken(5, 5+"disambiguation".length());
+        expectedTokens.put(disambiguation, "disambiguation");
+
+        int comedianBobMarleyIndex = sent2.getSpan().toString().indexOf("Comedian");
+        Chunk comedianBobMarley = sent2.addChunk(comedianBobMarleyIndex,
+            comedianBobMarleyIndex+"Comedian Bob Marley".length());
+        expectedChunks.put(comedianBobMarley, "Comedian Bob Marley");
+        Token comedian = comedianBobMarley.addToken(0, "Comedian".length());
+        expectedTokens.put(comedian, "Comedian");
+        Token bobSent2 = comedianBobMarley.addToken(9,9+"Bob".length());
+        expectedTokens.put(bobSent2, "Bob");
+        Token marleySent2 = comedianBobMarley.addToken(13, 13+"Marley".length());
+        expectedTokens.put(marleySent2, "Marley");
+
+        int parisIndex = sent2.getSpan().toString().indexOf("Paris");
+        Chunk parisInTexas = sent2.addChunk(parisIndex, parisIndex+"Paris in Texas".length());
+        expectedChunks.put(parisInTexas, "Paris in Texas");
+        Token parisSent2 = parisInTexas.addToken(0, "Paris".length());
+        expectedTokens.put(parisSent2, "Paris");
+        int inIndex = parisInTexas.getSpan().indexOf("in");
+        Token in = parisInTexas.addToken(inIndex,
+            inIndex+2);
+        expectedTokens.put(in, "in");
+        Token texasSent2 = parisInTexas.addToken(parisInTexas.getSpan().toString().indexOf("Texas"),
+            parisInTexas.getSpan().toString().indexOf("Texas")+"Texas".length());
+        expectedTokens.put(texasSent2, "Texas");
+
+    }
+
+
+    @Before
+    public void initAnalysedText() throws Exception {
+        at = createAnalysedText();
+    }
+    /**
+     * @throws IOException
+     */
+    private static AnalysedText createAnalysedText() throws IOException {
+        ci = ciFactory.createContentItem(new StringSource(text));
+        Entry<UriRef,Blob> textBlob = ContentItemHelper.getBlob(ci, Collections.singleton("text/plain"));
+        return atFactory.createAnalysedText(ci, textBlob.getValue());
+    }
+
+
+    @Test
+    public void testSpanFilter(){
+        Iterator<Sentence> sentences = analysedTextWithData.getSentences();
+        Iterator<Chunk> chunks = analysedTextWithData.getChunks();
+        Iterator<Token> tokens = analysedTextWithData.getTokens();
+        for(Entry<Sentence,String> sentEntry : expectedSentences.entrySet()){
+            Sentence sent = sentences.next();
+            Assert.assertEquals(sentEntry.getKey(), sent);
+            Assert.assertEquals(sentEntry.getValue(), sent.getSpan().toString());
+        }
+        for(Entry<Chunk,String> chunkEntry : expectedChunks.entrySet()){
+            Chunk chunk = chunks.next();
+            Assert.assertEquals(chunkEntry.getKey(), chunk);
+            Assert.assertEquals(chunkEntry.getValue(), chunk.getSpan().toString());
+        }
+        for(Entry<Token,String> tokenEntry : expectedTokens.entrySet()){
+            Token token = tokens.next();
+            Assert.assertEquals(tokenEntry.getKey(), token);
+            Assert.assertEquals(tokenEntry.getValue(), token.getSpan().toString());
+        }
+    }
+
+    @Test
+    public void testAnalysedText(){
+        Assert.assertEquals(text, at.getText());
+        Assert.assertEquals(text, at.getSpan());
+        Assert.assertEquals(0, at.getStart());
+        Assert.assertEquals(text.length(), at.getEnd());
+    }
+    /**
+     * Spans created relative to another MUST NOT exceed the span of the
+     * other one
+     */
+    @Test(expected=IllegalArgumentException.class)
+    public void testExceedsRelativeSpan(){
+        Sentence sent = at.addSentence(0, 10);
+        sent.addChunk(5, 15); //Invalid
+    }
+
+    @Test(expected=IllegalArgumentException.class)
+    public void testNegativeStart(){
+        at.addSentence(-1, 10);
+    }
+
+    @Test(expected=IllegalArgumentException.class)
+    public void testRelativeNegativeStart(){
+        Sentence sent = at.addSentence(0, 10);
+        sent.addToken(-1, 5);
+    }
+    @Test
+    public void testAnalysedTextaddSpanMethods(){
+        Collection<Span> spans = new HashSet<Span>();
+        //add some span of different types
+        spans.add(at.addToken(4, 11));
+        spans.add(at.addChunk(4,19));
+        spans.add(at.addSentence(0, 91));
+        Set<Span> atSpans = AnalysedTextUtils.asSet(at.getEnclosed(EnumSet.allOf(SpanTypeEnum.class)));
+        Assert.assertTrue(spans.containsAll(atSpans));
+        Assert.assertTrue(atSpans.containsAll(spans));
+    }
+    /**
+     * Test relative additions (with relative indexes) as well as iterators
+     * over this hierarchy
+     */
+    @Test
+    public void testSpanHierarchy(){
+        int[] startPos = new int[]{0,1,2};
+        int[] endPos = new int[]{1,2,3};
+        int maxVal = endPos[endPos.length-1];
+        int tokenLength = 5;
+        int chunkLength = tokenLength*maxVal;
+        int sentenceLength = tokenLength*maxVal*maxVal;
+        List<Sentence> sentences = new ArrayList<Sentence>(startPos.length);
+        List<Chunk> chunks = new ArrayList<Chunk>(startPos.length*2);
+        List<Token> tokens = new ArrayList<Token>(startPos.length*3);
+        int start;
+        int end;
+        //1. test relative add and absolute start/end
+        log.info("--- adding Spans ---");
+        for(int s=0;s<startPos.length;s++){
+            start = startPos[s]*sentenceLength;
+            end = endPos[s]*sentenceLength;
+            Sentence sent = at.addSentence(start, end);
+            log.info("add {}",sent);
+            assertEquals(start, sent.getStart());
+            assertEquals(end, sent.getEnd());
+            sentences.add(sent);
+        }
+        //1.b iterate over the sentences while adding Chunks and Tokens to
+        //    test that returned Iterators MUST NOT throw
+        //    ConcurrentModificationExceptions when adding Spans to the AnalysedText
+        Iterator<Sentence> sentenceIt = at.getSentences();
+        while(sentenceIt.hasNext()){
+            Sentence sent = sentenceIt.next();
+            for(int c=0;c<startPos.length;c++){
+                start = startPos[c]*chunkLength;
+                end = endPos[c]*chunkLength;
+                Chunk chunk = sent.addChunk(start, end);
+                log.info("  add {}",chunk);
+                start = sent.getStart() + start;
+                end = sent.getStart() + end;
+                assertEquals(start, chunk.getStart());
+                assertEquals(end, chunk.getEnd());
+                chunks.add(chunk);
+                for(int t=0;t<startPos.length;t++){
+                    start = startPos[t]*tokenLength;
+                    end = endPos[t]*tokenLength;
+                    Token token = chunk.addToken(start, end);
+                    log.info("    add {}",token);
+                    start = chunk.getStart() + start;
+                    end = chunk.getStart() + end;
+                    assertEquals(start, token.getStart());
+                    assertEquals(end, token.getEnd());
+                    tokens.add(token);
+                }
+            }
+        }
+        //2. test iterations of enclosed
+        int chunksInSentence = startPos.length;
+        int tokensInChunk = chunksInSentence;
+        int tokensInSentence = chunksInSentence*tokensInChunk;
+        Iterator<Sentence> sentIt = at.getSentences();
+        int s = 0;
+        int c = 0;
+        int t = 0;
+        log.info("--- iterating over Spans ---");
+        log.info("{}",at);
+        for(;sentIt.hasNext();s++){
+            assertTrue(sentences.size()+" Sentences Expected (found: "+(s+1)+")",s < sentences.size());
+            Sentence sent = sentIt.next();
+            log.info(" {}",sent);
+            assertEquals(sentences.get(s), sent);
+            Iterator<Chunk> chunkIt = sent.getChunks();
+            int foundChunks = 0;
+            for(;chunkIt.hasNext();c++){
+                assertTrue(chunks.size()+" Chunks Expected (found: "+(c+1)+")",c < chunks.size());
+                Chunk chunk = chunkIt.next();
+                log.info("  {}",chunk);
+                assertEquals(chunks.get(c), chunk);
+                Iterator<Token> tokenIt = chunk.getTokens();
+                int foundTokens = 0;
+                for(;tokenIt.hasNext();t++){
+                    assertTrue(tokens.size()+" Tokens Expected (found: "+(t+1)+")",t < tokens.size());
+                    Token token = tokenIt.next();
+                    log.info("    {}",token);
+                    assertEquals(tokens.get(t), token);
+                    foundTokens++;
+                }
+                assertEquals(tokensInChunk+" Tokens expected in Chunk", tokensInChunk,foundTokens);
+                foundChunks++;
+            }
+            assertEquals(chunksInSentence+" Chunks expected in Sentence", chunksInSentence,foundChunks);
+            //also iterate over tokens within a sentence
+            log.info(" {}",sent);
+            Iterator<Token> tokenIt = sent.getTokens();
+            int foundTokens = 0;
+            for(;tokenIt.hasNext();foundTokens++){
+                Token token = tokenIt.next();
+                log.info("    {}",token);
+                assertEquals(tokens.get(s*tokensInSentence+foundTokens), token);
+            }
+            assertEquals(tokensInSentence+" Tokens expected in Sentence", tokensInSentence,foundTokens);
+        }
+        assertEquals(sentences.size()+" Sentences Expected (found: "+s+")", sentences.size(),s);
+        assertEquals(chunks.size()+" Chunks Expected (found: "+c+")", chunks.size(),c);
+        assertEquals(tokens.size()+" Tokens Expected (found: "+t+")", tokens.size(),t);
+        //also iterate over Chunks in AnalysedText
+        Iterator<Chunk> chunkIt = at.getChunks();
+        int foundChunks = 0;
+        log.info("{}",at);
+        for(;chunkIt.hasNext();foundChunks++){
+            Chunk chunk = chunkIt.next();
+            log.info("  {}",chunk);
+            assertEquals(chunks.get(foundChunks), chunk);
+        }
+        assertEquals(chunks.size()+" Chunks expected in AnalysedText", chunks.size(),foundChunks);
+        //also iterate over Tokens in AnalysedText
+        Iterator<Token> tokenIt = at.getTokens();
+        int foundTokens = 0;
+        log.info("{}",at);
+        for(;tokenIt.hasNext();foundTokens++){
+            Token token = tokenIt.next();
+            log.info("    {}",token);
+            assertEquals(tokens.get(foundTokens), token);
+        }
+        assertEquals(tokens.size()+" Tokens expected in AnalysedText", tokens.size(),foundTokens);
+
+        //Finally iterate over multiple span types
+        Iterator<Span> sentencesAndChunks = at.getEnclosed(
+            EnumSet.of(SpanTypeEnum.Sentence,SpanTypeEnum.Chunk));
+        s=0;
+        c=0;
+        log.info("{} >> Iterate over Sentences and Chunks",at);
+        while(sentencesAndChunks.hasNext()){
+            Span span = sentencesAndChunks.next();
+            log.info(" {}",span);
+            if(span.getType() == SpanTypeEnum.Chunk){
+                assertEquals(chunks.get(c), span);
+                c++;
+            } else if(span.getType() == SpanTypeEnum.Sentence){
+                assertEquals(sentences.get(s), span);
+                s++;
+            } else {
+                Assert.fail("Unexpected SpanType '"+span.getType()+" (Span: "+span.getClass()+")");
+            }
+        }
+        assertEquals(sentences.size()+" Sentences expected in AnalysedText", sentences.size(),s);
+        assertEquals((sentences.size()*chunksInSentence)+" Chunks expected in AnalysedText",
+            (sentences.size()*chunksInSentence),c);
+    }
+
+    @Test
+    public void testAnnotation(){
+        List<Value<Number>> values = new ArrayList<Value<Number>>();
+        values.add(new Value<Number>(26,0.6));
+        values.add(new Value<Number>(27L));
+        values.add(new Value<Number>(28.0f));
+        values.add(new Value<Number>(25.0,0.8));
+        at.addAnnotations(testAnnotation, values);
+        Value<Number> value = at.getAnnotation(testAnnotation);
+        assertNotNull(value);
+        assertEquals(Double.valueOf(25.0), value.value());
+        assertEquals(0.8d, value.probability());
+        Number prev = Float.valueOf(24f);
+        for(Value<Number> v : at.getAnnotations(testAnnotation)){
+            assertNotNull(v);
+            assertTrue(v.value().doubleValue() > prev.doubleValue());
+            prev = v.value();
+        }
+        //check that the order of Annotations without probability is kept
+        at.addAnnotation(testAnnotation, new Value<Number>(29));
+        prev = Integer.valueOf(24);
+        for(Value<Number> v : at.getAnnotations(testAnnotation)){
+            assertNotNull(v);
+            assertTrue(v.value().intValue() > prev.intValue());
+            prev = v.value();
+        }
+
+    }
+
+}

Added: incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/test/java/org/apache/stanbol/enhancer/nlp/utils/NIFHelperTest.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/test/java/org/apache/stanbol/enhancer/nlp/utils/NIFHelperTest.java?rev=1387488&view=auto
==============================================================================
--- incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/test/java/org/apache/stanbol/enhancer/nlp/utils/NIFHelperTest.java (added)
+++ incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/test/java/org/apache/stanbol/enhancer/nlp/utils/NIFHelperTest.java Wed Sep 19 08:48:32 2012
@@ -0,0 +1,40 @@
+package org.apache.stanbol.enhancer.nlp.utils;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.nio.charset.Charset;
+
+import org.apache.clerezza.rdf.core.UriRef;
+import org.apache.stanbol.enhancer.servicesapi.helper.ContentItemHelper;
+import org.junit.Test;
+
+import junit.framework.Assert;
+
+public class NIFHelperTest {
+
+    static UriRef base = new UriRef("http://stanbol.apache.org/test/nif/nif-helper");
+    static String text = "This is a test for the NLP Interchange format!";
+
+
+    @Test
+    public void testFragmentURI(){
+        Assert.assertEquals(
+            new UriRef(base.getUnicodeString()+"#char=23,26"),
+            NIFHelper.getNifFragmentURI(base, 23, 26));
+    }
+    @Test
+    public void testOffsetURI(){
+        Assert.assertEquals(
+            base.getUnicodeString()+"#offset_23_26",
+            NIFHelper.getNifOffsetURI(base, 23, 26).getUnicodeString());
+    }
+    @Test
+    public void testHashURI() throws IOException {
+        String selected = text.substring(23,26);
+        String context = text.substring(13,23)+'('+selected+')'+text.substring(26,36);
+        byte[] contextData = context.getBytes(Charset.forName("UTF8"));
+        String md5 = ContentItemHelper.streamDigest(new ByteArrayInputStream(contextData), null, "MD5");
+        UriRef expected = new UriRef(base.getUnicodeString()+"#hash_10_3_"+md5+"_NLP");
+        Assert.assertEquals(expected, NIFHelper.getNifHashURI(base, 23, 26, text));
+    }
+}

Modified: incubator/stanbol/branches/stanbol-nlp-processing/enhancer/pom.xml
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/stanbol-nlp-processing/enhancer/pom.xml?rev=1387488&r1=1387487&r2=1387488&view=diff
==============================================================================
--- incubator/stanbol/branches/stanbol-nlp-processing/enhancer/pom.xml (original)
+++ incubator/stanbol/branches/stanbol-nlp-processing/enhancer/pom.xml Wed Sep 19 08:48:32 2012
@@ -54,6 +54,7 @@
     <module>generic/core</module>
     <module>generic/test</module>
     <module>generic/rdfentities</module>
+    <module>generic/nlp</module>
 
     <module>jobmanager</module>
 
     <module>chain/allactive</module>
@@ -61,6 +62,8 @@
     <module>chain/weighted</module>
     <module>chain/list</module>
 
+    <module>engines</module>
+
     <module>jersey</module>
     <module>ldpath</module>
     <module>benchmark</module>

Propchange: incubator/stanbol/branches/stanbol-nlp-processing/nlp-launcher/
------------------------------------------------------------------------------
--- svn:ignore (added)
+++ svn:ignore Wed Sep 19 08:48:32 2012
@@ -0,0 +1,7 @@
+.project
+
+.settings
+
+target
+
+.classpath

Added: incubator/stanbol/branches/stanbol-nlp-processing/nlp-launcher/pom.xml
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/stanbol-nlp-processing/nlp-launcher/pom.xml?rev=1387488&view=auto
==============================================================================
--- incubator/stanbol/branches/stanbol-nlp-processing/nlp-launcher/pom.xml (added)
+++ incubator/stanbol/branches/stanbol-nlp-processing/nlp-launcher/pom.xml Wed Sep 19 08:48:32 2012
@@ -0,0 +1,212 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements. See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License. You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+  <parent>
+    <groupId>org.apache.stanbol</groupId>
+    <artifactId>stanbol-parent</artifactId>
+    <version>2-incubating-SNAPSHOT</version>
+    <relativePath>../../parent</relativePath>
+  </parent>
+
+  <groupId>org.apache.stanbol</groupId>
+  <artifactId>org.apache.stanbol.launchers.enhancer-nlp</artifactId>
+  <version>0.10.0-incubating-SNAPSHOT</version>
+  <packaging>jar</packaging>
+
+  <name>Apache Stanbol Launchers for the NLP processing branch</name>
+  <description>
+    Runnable jar configured to test engines included in the
+    NLP processing branch (STANBOL-733)
+  </description>
+
+  <scm>
+    <url>http://incubator.apache.org/stanbol/</url>
+  </scm>
+
+  <build>
+    <plugins>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-clean-plugin</artifactId>
+        <configuration>
+          <filesets>
+            <fileset>
+              <directory>.</directory>
+              <includes>
+                <include>stanbol/**</include>
+              </includes>
+            </fileset>
+          </filesets>
+        </configuration>
+      </plugin>
+      <plugin>
+        <groupId>org.apache.sling</groupId>
+        <artifactId>maven-launchpad-plugin</artifactId>
+        <!--
+          TODO the maven-launchpad-plugin can also generate a war file and
+          Karaf description, we could add this. See
+          http://sling.apache.org/site/maven-launchpad-plugin.html
+        -->
+        <executions>
+          <execution>
+            <id>prepare-package</id>
+            <goals>
+              <goal>prepare-package</goal>
+              <goal>attach-bundle-list</goal>
+            </goals>
+            <configuration>
+              <includeDefaultBundles>false</includeDefaultBundles>
+              <!-- Standalone jar requires an OSGi http service implementation -->
+              <jarWebSupport>
+                <groupId>org.apache.felix</groupId>
+                <artifactId>org.apache.felix.http.jetty</artifactId>
+                <version>2.2.0</version>
+              </jarWebSupport>
+            </configuration>
+          </execution>
+        </executions>
+      </plugin>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-jar-plugin</artifactId>
+        <configuration>
+          <archive>
+            <manifest>
+              <!-- make the generated jar runnable -->
+              <addClasspath>true</addClasspath>
+              <mainClass>org.apache.stanbol.launchpad.Main</mainClass>
+              <addDefaultImplementationEntries>true</addDefaultImplementationEntries>
+            </manifest>
+          </archive>
+        </configuration>
+      </plugin>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-shade-plugin</artifactId>
+        <version>1.2</version>
+        <executions>
+          <execution>
+            <phase>package</phase>
+            <goals>
+              <goal>shade</goal>
+            </goals>
+            <configuration>
+              <artifactSet>
+                <!-- Use this to in/exclude only specific dependencies -->
+                <includes>
+                  <include>org.apache.stanbol:org.apache.stanbol.launchpad</include>
+                </includes>
+              </artifactSet>
+              <transformers>
+                <transformer implementation="org.apache.maven.plugins.shade.resource.ComponentsXmlResourceTransformer" />
+              </transformers>
+            </configuration>
+          </execution>
+        </executions>
+      </plugin>
+      <plugin>
+        <groupId>org.apache.rat</groupId>
+        <artifactId>apache-rat-plugin</artifactId>
+        <configuration>
+          <excludes>
+          </excludes>
+        </configuration>
+      </plugin>
+    </plugins>
+  </build>
+
+  <dependencies>
+    <dependency>
+      <!-- The Apache Stanbol launchpad -->
+      <groupId>org.apache.stanbol</groupId>
+      <artifactId>org.apache.stanbol.launchpad</artifactId>
+      <version>0.10.0-incubating-SNAPSHOT</version>
+    </dependency>
+    <dependency>
+      <!-- maven-launchpad-plugin builds on the launchpad.base app -->
+      <groupId>org.apache.sling</groupId>
+      <artifactId>org.apache.sling.launchpad.base</artifactId>
+      <classifier>app</classifier>
+      <scope>provided</scope>
+    </dependency>
+
+    <!-- OSGi Framework Bundle List -->
+    <dependency>
+      <groupId>org.apache.stanbol</groupId>
+      <artifactId>org.apache.stanbol.launchers.bundlelists.osgiframework</artifactId>
+      <version>0.10.0-incubating-SNAPSHOT</version>
+      <type>partialbundlelist</type>
+      <scope>provided</scope>
+    </dependency>
+
+    <!-- Stanbol Commons Bundle List -->
+    <dependency>
+      <groupId>org.apache.stanbol</groupId>
+      <artifactId>org.apache.stanbol.launchers.bundlelists.stanbolcommons</artifactId>
+      <version>0.10.0-incubating-SNAPSHOT</version>
+      <type>partialbundlelist</type>
+      <scope>provided</scope>
+    </dependency>
+
+    <!-- Stanbol Enhancer Bundle List -->
+    <dependency>
+      <groupId>org.apache.stanbol</groupId>
+      <artifactId>org.apache.stanbol.enhancer.bundlelist</artifactId>
+      <version>0.10.0-incubating-SNAPSHOT</version>
+      <type>partialbundlelist</type>
+      <scope>provided</scope>
+    </dependency>
+
+    <!-- Stanbol Data Bundle List -->
+    <dependency>
+      <groupId>org.apache.stanbol</groupId>
+      <artifactId>org.apache.stanbol.data.bundlelists.defaultdata</artifactId>
+      <version>0.10.0-incubating-SNAPSHOT</version>
+      <type>partialbundlelist</type>
+      <scope>provided</scope>
+    </dependency>
+    <!-- OpenNLP Data Bundle List -->
+    <dependency>
+      <groupId>org.apache.stanbol</groupId>
+      <artifactId>org.apache.stanbol.data.bundlelists.opennlp</artifactId>
+      <version>0.10.0-incubating-SNAPSHOT</version>
+      <type>partialbundlelist</type>
+      <scope>provided</scope>
+    </dependency>
+    <!-- Sentiment Data Bundle List -->
+    <dependency>
+      <groupId>org.apache.stanbol</groupId>
+      <artifactId>org.apache.stanbol.data.bundlelists.sentiment</artifactId>
+      <version>0.10.0-incubating-SNAPSHOT</version>
+      <type>partialbundlelist</type>
+      <scope>provided</scope>
+    </dependency>
+
+    <!-- Stanbol Entityhub Bundle List -->
+    <dependency>
+      <groupId>org.apache.stanbol</groupId>
+      <artifactId>org.apache.stanbol.entityhub.bundlelist</artifactId>
+      <version>0.11.0-incubating-SNAPSHOT</version>
+      <type>partialbundlelist</type>
+      <scope>provided</scope>
+    </dependency>
+  </dependencies>
+
+</project>

Added: incubator/stanbol/branches/stanbol-nlp-processing/nlp-launcher/src/main/bundles/list.xml
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/stanbol-nlp-processing/nlp-launcher/src/main/bundles/list.xml?rev=1387488&view=auto
==============================================================================
--- incubator/stanbol/branches/stanbol-nlp-processing/nlp-launcher/src/main/bundles/list.xml (added)
+++ incubator/stanbol/branches/stanbol-nlp-processing/nlp-launcher/src/main/bundles/list.xml Wed Sep 19 08:48:32 2012
@@ -0,0 +1,35 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements. See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License. You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<!--
+  List of initial bundles for the Stanbol Sling-based standalone launcher.
+-->
+<bundles>
+  <!-- General-purpose libraries -->
+
+  <!-- *********************************************************************
+       start levels 20 TO 24 reserved for Stanbol Framework
+       (Enhancer, Entityhub, Contenthub, Factstore ... incl. Web Fragments)
+       ********************************************************************* -->
+
+
+
+  <!-- *********************************************************************
+       start levels >= 30 are unused
+       ********************************************************************* -->
+
+</bundles>

Added: incubator/stanbol/branches/stanbol-nlp-processing/nlp-launcher/src/main/sling/common.properties
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/stanbol-nlp-processing/nlp-launcher/src/main/sling/common.properties?rev=1387488&view=auto
==============================================================================
--- incubator/stanbol/branches/stanbol-nlp-processing/nlp-launcher/src/main/sling/common.properties (added)
+++ incubator/stanbol/branches/stanbol-nlp-processing/nlp-launcher/src/main/sling/common.properties Wed Sep 19 08:48:32 2012
@@ -0,0 +1,24 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# This file is loaded by Apache Sling during startup. Properties defined
+# in this file are copied over to the sling.properties file in the {sling.home}
+# directory.
+
+# The stanbol home directory
+# by default this is set to the same value as sling.home
+stanbol.home=${sling.home}
+org.osgi.framework.startlevel.beginning=40

Added: incubator/stanbol/branches/stanbol-nlp-processing/pom.xml
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/stanbol-nlp-processing/pom.xml?rev=1387488&view=auto
==============================================================================
--- incubator/stanbol/branches/stanbol-nlp-processing/pom.xml (added)
+++ incubator/stanbol/branches/stanbol-nlp-processing/pom.xml Wed Sep 19 08:48:32 2012
@@ -0,0 +1,33 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
+  license agreements. See the NOTICE file distributed with this work for additional
+  information regarding copyright ownership. The ASF licenses this file to
+  You under the Apache License, Version 2.0 (the "License"); you may not use
+  this file except in compliance with the License. You may obtain a copy of
+  the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
+  by applicable law or agreed to in writing, software distributed under the
+  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
+  OF ANY KIND, either express or implied. See the License for the specific
+  language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+
+  <modelVersion>4.0.0</modelVersion>
+
+  <groupId>org.apache.stanbol</groupId>
+  <artifactId>org.apache.stanbol.stanbolnlpprocessing.reactor</artifactId>
+  <version>0.10.0-incubating-SNAPSHOT</version>
+  <packaging>pom</packaging>
+
+  <name>Apache Stanbol NLP Processing Branch Reactor</name>
+  <description>
+    Dummy reactor to compile all modules in the Stanbol NLP processing branch (STANBOL-733)
+  </description>
+
+  <modules>
+    <module>data</module>
+    <module>enhancer</module>
+    <module>nlp-launcher</module>
+  </modules>
+
+</project>
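As a closing note on the NIF URI schemes exercised by NIFHelperTest above: they map a span's absolute character offsets onto fragment identifiers such as #char=23,26 or #offset_23_26. A minimal sketch of how a Token could be tied to such a URI - the class name and content URI are hypothetical; only the NIFHelper.getNifOffsetURI(UriRef,int,int) signature and the Span start/end accessors shown in this commit are assumed:

    import org.apache.clerezza.rdf.core.UriRef;
    import org.apache.stanbol.enhancer.nlp.model.Token;
    import org.apache.stanbol.enhancer.nlp.utils.NIFHelper;

    /** Hypothetical helper mapping Tokens to NIF offset-based URIs. */
    public class NifUriSketch {

        /**
         * Builds the NIF offset-based URI for a Token, e.g.
         * <http://example.org/content#offset_23_26> for a Token
         * covering the characters 23..26 of the content.
         */
        public static UriRef tokenUri(UriRef contentUri, Token token) {
            return NIFHelper.getNifOffsetURI(contentUri, token.getStart(), token.getEnd());
        }
    }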