Added: stanbol/branches/stanbol-solr4/enhancement-engines/kuromoji-nlp/src/test/java/org/apache/stanbol/enhancer/engines/kuromoji/impl/TestKuromojiNlpEngine.java URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-solr4/enhancement-engines/kuromoji-nlp/src/test/java/org/apache/stanbol/enhancer/engines/kuromoji/impl/TestKuromojiNlpEngine.java?rev=1455131&view=auto ============================================================================== --- stanbol/branches/stanbol-solr4/enhancement-engines/kuromoji-nlp/src/test/java/org/apache/stanbol/enhancer/engines/kuromoji/impl/TestKuromojiNlpEngine.java (added) +++ stanbol/branches/stanbol-solr4/enhancement-engines/kuromoji-nlp/src/test/java/org/apache/stanbol/enhancer/engines/kuromoji/impl/TestKuromojiNlpEngine.java Mon Mar 11 13:18:59 2013 @@ -0,0 +1,138 @@ +package org.apache.stanbol.enhancer.engines.kuromoji.impl; + +import java.io.IOException; +import java.util.Dictionary; +import java.util.HashMap; +import java.util.Hashtable; +import java.util.List; +import java.util.Map; + +import org.apache.clerezza.rdf.core.LiteralFactory; +import org.apache.clerezza.rdf.core.Resource; +import org.apache.clerezza.rdf.core.UriRef; +import org.apache.clerezza.rdf.core.impl.PlainLiteralImpl; +import org.apache.clerezza.rdf.core.impl.TripleImpl; +import org.apache.stanbol.commons.solr.utils.DataFileResourceLoader; +import org.apache.stanbol.commons.stanboltools.datafileprovider.DataFileProvider; +import org.apache.stanbol.enhancer.contentitem.inmemory.InMemoryContentItemFactory; +import org.apache.stanbol.enhancer.nlp.NlpAnnotations; +import org.apache.stanbol.enhancer.nlp.model.AnalysedText; +import org.apache.stanbol.enhancer.nlp.model.AnalysedTextFactory; +import org.apache.stanbol.enhancer.nlp.model.AnalysedTextUtils; +import org.apache.stanbol.enhancer.nlp.model.Chunk; +import org.apache.stanbol.enhancer.nlp.model.Sentence; +import org.apache.stanbol.enhancer.nlp.model.Token; +import 
org.apache.stanbol.enhancer.nlp.model.annotation.Value; +import org.apache.stanbol.enhancer.nlp.ner.NerTag; +import org.apache.stanbol.enhancer.nlp.pos.PosTag; +import org.apache.stanbol.enhancer.servicesapi.ContentItem; +import org.apache.stanbol.enhancer.servicesapi.ContentItemFactory; +import org.apache.stanbol.enhancer.servicesapi.EngineException; +import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine; +import org.apache.stanbol.enhancer.servicesapi.impl.StringSource; +import org.apache.stanbol.enhancer.servicesapi.rdf.Properties; +import org.apache.stanbol.enhancer.test.helper.EnhancementStructureHelper; +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.BeforeClass; +import org.junit.Test; +import org.osgi.service.cm.ConfigurationException; + +public class TestKuromojiNlpEngine { + public static final String FAKE_BUNDLE_SYMBOLIC_NAME = "FAKE_BUNDLE_SYMBOLIC_NAME"; + + private static DataFileProvider dataFileProvider; + + private static ContentItemFactory contentItemFactory; + + private static UriRef id = new UriRef("http://www.example.org/contentItem1"); + /** + * Test text taken from the <a href ="http://ja.wikipedia.org/wiki/%E3%83%AD%E3%83%B3%E3%83%89%E3%83%B3"> + * Japanese wikipedia side for London</a>. 
+ */ + private static String text = "ロンドンはイングランドおよびイギリスの首都であり、イギリスや欧州"+ + "連合域内で最大の都市圏を形成している。ロンドンはテムズ川河畔に位置し、2,000年前のローマ帝国"+ + "によるロンディニウム創建が都市の起源である。ロンディニウム当時の街の中心部は、現在のシティ・"+ + "オブ・ロンドン（シティ）に当たる地域にあった。シティの市街壁内の面積は約1平方マイルあり、"+ + "中世以来その範囲はほぼ変わっていない。少なくとも19世紀以降、「ロンドン」の名称はシティの市"+ + "街壁を越えて開発が進んだシティ周辺地域をも含めて用いられている。 ロンドンは市街地の大部分は"+ + "コナベーションにより形成されている。ロンドンを管轄するリージョンであるグレーター・ロンドンでは"+ + "、選挙で選出された大ロンドン市長とロンドン議会により統治が行われている。"; + + private KuromojiNlpEngine engine; + + private ContentItem contentItem; + + @BeforeClass + public static void initDataFileProvicer(){ + dataFileProvider = new ClasspathDataFileProvider(FAKE_BUNDLE_SYMBOLIC_NAME); + contentItemFactory = InMemoryContentItemFactory.getInstance(); + } + + @Before + public void setUpServices() throws IOException , ConfigurationException { + engine = new KuromojiNlpEngine(); + //we need to set some fields that would otherwise be injected by the container + engine.parentResourceLoader = new DataFileResourceLoader(dataFileProvider); + engine.analysedTextFactory = AnalysedTextFactory.getDefaultInstance(); + Dictionary<String,Object> config = new Hashtable<String,Object>(); + config.put(EnhancementEngine.PROPERTY_NAME, "gosen-nlp"); + engine.activate(new MockComponentContext(config)); + contentItem = contentItemFactory.createContentItem(id, new StringSource(text)); + //add an annotation that this is Japanese + contentItem.getMetadata().add(new TripleImpl(id, Properties.DC_LANGUAGE, + new PlainLiteralImpl("ja"))); + } + + @Test + public void testEngine() throws EngineException { + LiteralFactory lf = LiteralFactory.getInstance(); + Assert.assertEquals(EnhancementEngine.ENHANCE_ASYNC, engine.canEnhance(contentItem)); + engine.computeEnhancements(contentItem); + //assert the results + Map<UriRef,Resource> 
expected = new HashMap<UriRef,Resource>(); + expected.put(Properties.DC_CREATOR, lf.createTypedLiteral(engine.getClass().getName())); + expected.put(Properties.ENHANCER_EXTRACTED_FROM,contentItem.getUri()); + Assert.assertEquals(16, EnhancementStructureHelper.validateAllTextAnnotations( + contentItem.getMetadata(), text, expected)); + AnalysedText at = AnalysedTextUtils.getAnalysedText(contentItem); + Assert.assertNotNull(at); + List<Sentence> sentences = AnalysedTextUtils.asList(at.getSentences()); + Assert.assertNotNull(sentences); + Assert.assertEquals(7, sentences.size()); + //TODO: values in the following arrays are based on the first run of the + // engine. So this is only to detect changes in results. It can not validate + // that the tokenization and NER detections are correct - sorry I do not + // speak Japanese ... + int[] expectedChunks = new int[]{ 5, 3, 1, 0, 1, 2, 4}; + int[] expectedTokens = new int[]{ 25, 25, 25, 24, 33, 17, 32}; + int sentIndex = 0; + for(Sentence sent : sentences){ + List<Chunk> sentenceNer = AnalysedTextUtils.asList(sent.getChunks()); + Assert.assertEquals(expectedChunks[sentIndex], sentenceNer.size()); + for(Chunk chunk : sentenceNer){ + Value<NerTag> nerValue = chunk.getAnnotation(NlpAnnotations.NER_ANNOTATION); + Assert.assertNotNull(nerValue); + Assert.assertNotNull(nerValue.value().getType()); + } + List<Token> tokens = AnalysedTextUtils.asList(sent.getTokens()); + Assert.assertEquals(expectedTokens[sentIndex], tokens.size()); + for(Token token : tokens){ + Value<PosTag> posValue = token.getAnnotation(NlpAnnotations.POS_ANNOTATION); + Assert.assertNotNull(posValue); + } + sentIndex++; + } + } + + + @After + public void cleanUpServices(){ + if(engine != null){ + engine.deactivate(null); + } + engine = null; + } + +}
Modified: stanbol/branches/stanbol-solr4/enhancement-engines/paoding-token/pom.xml URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-solr4/enhancement-engines/paoding-token/pom.xml?rev=1455131&r1=1455130&r2=1455131&view=diff ============================================================================== --- stanbol/branches/stanbol-solr4/enhancement-engines/paoding-token/pom.xml (original) +++ stanbol/branches/stanbol-solr4/enhancement-engines/paoding-token/pom.xml Mon Mar 11 13:18:59 2013 @@ -95,6 +95,17 @@ <artifactId>org.apache.stanbol.commons.solr.extras.paoding</artifactId> <version>0.11.0</version> </dependency> + <!-- paoding does not support solr4 yet --> + <dependency> + <groupId>org.apache.lucene</groupId> + <artifactId>lucene-core</artifactId> + <version>3.6.1</version> + </dependency> + <dependency> + <groupId>org.apache.lucene</groupId> + <artifactId>lucene-analyzers</artifactId> + <version>3.6.1</version> + </dependency> <dependency> <groupId>org.apache.stanbol</groupId> <artifactId>org.apache.stanbol.enhancer.nlp</artifactId> Modified: stanbol/branches/stanbol-solr4/enhancement-engines/paoding-token/src/main/java/org/apache/stanbol/enhancer/engines/paoding/token/PaodingTokenizerEngine.java URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-solr4/enhancement-engines/paoding-token/src/main/java/org/apache/stanbol/enhancer/engines/paoding/token/PaodingTokenizerEngine.java?rev=1455131&r1=1455130&r2=1455131&view=diff ============================================================================== --- stanbol/branches/stanbol-solr4/enhancement-engines/paoding-token/src/main/java/org/apache/stanbol/enhancer/engines/paoding/token/PaodingTokenizerEngine.java (original) +++ stanbol/branches/stanbol-solr4/enhancement-engines/paoding-token/src/main/java/org/apache/stanbol/enhancer/engines/paoding/token/PaodingTokenizerEngine.java Mon Mar 11 13:18:59 2013 @@ -154,6 +154,7 @@ public class PaodingTokenizerEngine exte TokenStream ts = 
pa.tokenStream("dummy", new CharSequenceReader(at.getText())); int lastEnd = 0; try { + ts.reset(); while(ts.incrementToken()){ OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class); //when tokenizing labels we need to preserve all chars Modified: stanbol/branches/stanbol-solr4/enhancement-engines/pom.xml URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-solr4/enhancement-engines/pom.xml?rev=1455131&r1=1455130&r2=1455131&view=diff ============================================================================== --- stanbol/branches/stanbol-solr4/enhancement-engines/pom.xml (original) +++ stanbol/branches/stanbol-solr4/enhancement-engines/pom.xml Mon Mar 11 13:18:59 2013 @@ -64,6 +64,9 @@ <module>paoding-token</module> <!-- tokenizing --> <module>nlp2rdf</module> <!-- converts AnalyzedText ContentPart to RDF --> + <!-- Japanese NLP processing --> + <module>kuromoji-nlp</module> + <!-- RESTful NLP analyser service engine--> <module>restful-nlp</module> <!-- see STANBOL-893 --> <module>restful-langident</module> <!-- see STANBOL-895 --> Modified: stanbol/branches/stanbol-solr4/enhancement-engines/sentiment-wordclassifier/pom.xml URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-solr4/enhancement-engines/sentiment-wordclassifier/pom.xml?rev=1455131&r1=1455130&r2=1455131&view=diff ============================================================================== --- stanbol/branches/stanbol-solr4/enhancement-engines/sentiment-wordclassifier/pom.xml (original) +++ stanbol/branches/stanbol-solr4/enhancement-engines/sentiment-wordclassifier/pom.xml Mon Mar 11 13:18:59 2013 @@ -108,11 +108,15 @@ <artifactId>org.apache.stanbol.enhancer.nlp</artifactId> <version>0.10.0</version> </dependency> - <dependency> + <dependency> <!-- for tracking and loading sentiment wordlists --> <groupId>org.apache.stanbol</groupId> - <artifactId>org.apache.stanbol.commons.solr.core</artifactId> + 
<artifactId>org.apache.stanbol.commons.stanboltools.datafileprovider</artifactId> <version>0.11.0</version> </dependency> + <dependency><!-- for stemming English words --> + <groupId>org.apache.lucene</groupId> + <artifactId>lucene-analyzers-common</artifactId> + </dependency> <dependency> <groupId>org.apache.felix</groupId> <artifactId>org.apache.felix.scr.annotations</artifactId> Modified: stanbol/branches/stanbol-solr4/enhancement-engines/sentiment-wordclassifier/src/main/java/org/apache/stanbol/enhancer/engines/sentiment/classifiers/SentiWordNet.java URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-solr4/enhancement-engines/sentiment-wordclassifier/src/main/java/org/apache/stanbol/enhancer/engines/sentiment/classifiers/SentiWordNet.java?rev=1455131&r1=1455130&r2=1455131&view=diff ============================================================================== --- stanbol/branches/stanbol-solr4/enhancement-engines/sentiment-wordclassifier/src/main/java/org/apache/stanbol/enhancer/engines/sentiment/classifiers/SentiWordNet.java (original) +++ stanbol/branches/stanbol-solr4/enhancement-engines/sentiment-wordclassifier/src/main/java/org/apache/stanbol/enhancer/engines/sentiment/classifiers/SentiWordNet.java Mon Mar 11 13:18:59 2013 @@ -166,7 +166,7 @@ public class SentiWordNet { private ReadWriteLock lock = new ReentrantReadWriteLock(); private Map<String,Double> wordMap = new TreeMap<String,Double>(); - private EnglishMinimalStemmer stemmer = new EnglishMinimalStemmer(); + private org.apache.lucene.analysis.en.EnglishMinimalStemmer stemmer = new EnglishMinimalStemmer(); protected SentiWordNetClassifierEN() {} Modified: stanbol/branches/stanbol-solr4/enhancement-engines/smartcn-token/pom.xml URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-solr4/enhancement-engines/smartcn-token/pom.xml?rev=1455131&r1=1455130&r2=1455131&view=diff ============================================================================== --- 
stanbol/branches/stanbol-solr4/enhancement-engines/smartcn-token/pom.xml (original) +++ stanbol/branches/stanbol-solr4/enhancement-engines/smartcn-token/pom.xml Mon Mar 11 13:18:59 2013 @@ -86,7 +86,7 @@ <dependency> <groupId>org.apache.stanbol</groupId> <artifactId>org.apache.stanbol.commons.solr.extras.smartcn</artifactId> - <version>0.11.0</version> + <version>0.12.0-SNAPSHOT</version> </dependency> <dependency> <groupId>org.apache.stanbol</groupId> Modified: stanbol/branches/stanbol-solr4/enhancement-engines/smartcn-token/src/main/java/org/apache/stanbol/enhancer/engines/smartcn/impl/SmartcnSentenceEngine.java URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-solr4/enhancement-engines/smartcn-token/src/main/java/org/apache/stanbol/enhancer/engines/smartcn/impl/SmartcnSentenceEngine.java?rev=1455131&r1=1455130&r2=1455131&view=diff ============================================================================== --- stanbol/branches/stanbol-solr4/enhancement-engines/smartcn-token/src/main/java/org/apache/stanbol/enhancer/engines/smartcn/impl/SmartcnSentenceEngine.java (original) +++ stanbol/branches/stanbol-solr4/enhancement-engines/smartcn-token/src/main/java/org/apache/stanbol/enhancer/engines/smartcn/impl/SmartcnSentenceEngine.java Mon Mar 11 13:18:59 2013 @@ -20,6 +20,7 @@ import static org.apache.stanbol.enhance import static org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.initAnalysedText; import java.io.IOException; +import java.io.StringReader; import java.util.Collections; import java.util.HashMap; import java.util.Iterator; @@ -152,6 +153,7 @@ public class SmartcnSentenceEngine exten //first the sentences TokenStream sentences = new SentenceTokenizer(new CharSequenceReader(at.getText())); try { + sentences.reset(); while(sentences.incrementToken()){ OffsetAttribute offset = sentences.addAttribute(OffsetAttribute.class); Sentence s = at.addSentence(offset.startOffset(), offset.endOffset()); @@ -203,6 +205,7 @@ public class 
SmartcnSentenceEngine exten private Sentence sentence = null; protected AnalyzedTextSentenceTokenizer(AnalysedText at) { + super(new StringReader(at.getText().toString())); this.at = at; sentences = at.getSentences(); } Modified: stanbol/branches/stanbol-solr4/enhancement-engines/smartcn-token/src/main/java/org/apache/stanbol/enhancer/engines/smartcn/impl/SmartcnTokenizerEngine.java URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-solr4/enhancement-engines/smartcn-token/src/main/java/org/apache/stanbol/enhancer/engines/smartcn/impl/SmartcnTokenizerEngine.java?rev=1455131&r1=1455130&r2=1455131&view=diff ============================================================================== --- stanbol/branches/stanbol-solr4/enhancement-engines/smartcn-token/src/main/java/org/apache/stanbol/enhancer/engines/smartcn/impl/SmartcnTokenizerEngine.java (original) +++ stanbol/branches/stanbol-solr4/enhancement-engines/smartcn-token/src/main/java/org/apache/stanbol/enhancer/engines/smartcn/impl/SmartcnTokenizerEngine.java Mon Mar 11 13:18:59 2013 @@ -20,6 +20,7 @@ import static org.apache.stanbol.enhance import static org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.initAnalysedText; import java.io.IOException; +import java.io.StringReader; import java.util.Collections; import java.util.HashMap; import java.util.Iterator; @@ -170,6 +171,7 @@ public class SmartcnTokenizerEngine exte //now the tokens TokenStream tokens = new WordTokenFilter(new AnalyzedTextSentenceTokenizer(at)); try { + tokens.reset(); while(tokens.incrementToken()){ OffsetAttribute offset = tokens.addAttribute(OffsetAttribute.class); Token t = at.addToken(offset.startOffset(), offset.endOffset()); @@ -219,6 +221,7 @@ public class SmartcnTokenizerEngine exte private Sentence sentence = null; protected AnalyzedTextSentenceTokenizer(AnalysedText at) { + super(new StringReader(at.getText().toString())); this.at = at; sentences = at.getSentences(); } Modified: 
stanbol/branches/stanbol-solr4/enhancement-engines/topic/engine/pom.xml URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-solr4/enhancement-engines/topic/engine/pom.xml?rev=1455131&r1=1455130&r2=1455131&view=diff ============================================================================== --- stanbol/branches/stanbol-solr4/enhancement-engines/topic/engine/pom.xml (original) +++ stanbol/branches/stanbol-solr4/enhancement-engines/topic/engine/pom.xml Mon Mar 11 13:18:59 2013 @@ -218,7 +218,6 @@ <dependency> <groupId>org.apache.httpcomponents</groupId> <artifactId>httpcore-osgi</artifactId> - <version>4.0.1</version> <scope>test</scope> </dependency> <dependency> @@ -294,15 +293,16 @@ <artifactId>org.apache.stanbol.enhancer.servicesapi</artifactId> <version>0.10.0</version> </dependency> - <dependency> + <!-- anyway transitive dependency of managed + <dependency> <groupId>org.apache.stanbol</groupId> <artifactId>org.apache.stanbol.commons.solr.core</artifactId> - <version>0.11.0</version> - </dependency> + <version>0.12.0-SNAPSHOT</version> + </dependency> --> <dependency> <groupId>org.apache.stanbol</groupId> <artifactId>org.apache.stanbol.commons.solr.managed</artifactId> - <version>0.11.0</version> + <version>0.12.0-SNAPSHOT</version> </dependency> </dependencies> Modified: stanbol/branches/stanbol-solr4/entityhub/generic/test/src/main/java/org/apache/stanbol/entityhub/test/it/AssertEntityhubJson.java URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-solr4/entityhub/generic/test/src/main/java/org/apache/stanbol/entityhub/test/it/AssertEntityhubJson.java?rev=1455131&r1=1455130&r2=1455131&view=diff ============================================================================== --- stanbol/branches/stanbol-solr4/entityhub/generic/test/src/main/java/org/apache/stanbol/entityhub/test/it/AssertEntityhubJson.java (original) +++ 
stanbol/branches/stanbol-solr4/entityhub/generic/test/src/main/java/org/apache/stanbol/entityhub/test/it/AssertEntityhubJson.java Mon Mar 11 13:18:59 2013 @@ -63,6 +63,7 @@ public class AssertEntityhubJson { * {@link QueryTestCase#getExpectedStatus()} is a 2xx status code. */ public static void assertQueryResults(RequestExecutor re, QueryTestCase test) throws JSONException{ + log.debug("Assert Query Results for test {}",test.getContent()); re.assertStatus(test.getExpectedStatus()); re.assertContentType("application/json"); //currently only application/json is supported if(!test.expectsSuccess()){ Modified: stanbol/branches/stanbol-solr4/entityhub/generic/test/src/main/java/org/apache/stanbol/entityhub/test/it/EntityhubTestBase.java URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-solr4/entityhub/generic/test/src/main/java/org/apache/stanbol/entityhub/test/it/EntityhubTestBase.java?rev=1455131&r1=1455130&r2=1455131&view=diff ============================================================================== --- stanbol/branches/stanbol-solr4/entityhub/generic/test/src/main/java/org/apache/stanbol/entityhub/test/it/EntityhubTestBase.java (original) +++ stanbol/branches/stanbol-solr4/entityhub/generic/test/src/main/java/org/apache/stanbol/entityhub/test/it/EntityhubTestBase.java Mon Mar 11 13:18:59 2013 @@ -97,6 +97,14 @@ public abstract class EntityhubTestBase referencedSite)); } } + //this ensures that all sites are initialized + for(String referencedSite : referencedSites){ + re = executor.execute( + builder.buildGetRequest("/entityhub/site/"+referencedSite + + "/entity?id=urn:does:not:exist:f82js95xsig39s.23987") + .withHeader("Accept", "application/json")); + re.assertStatus(404); + } log.info("Entityhub services checked, all present"); return true; }
