Modified: stanbol/branches/stanbol-solr4/entityhub/indexing/destination/solryard/src/test/resources/testConfigs/withSolrConf/indexing/config/simple/conf/spellings.txt URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-solr4/entityhub/indexing/destination/solryard/src/test/resources/testConfigs/withSolrConf/indexing/config/simple/conf/spellings.txt?rev=1455131&r1=1455130&r2=1455131&view=diff ============================================================================== --- stanbol/branches/stanbol-solr4/entityhub/indexing/destination/solryard/src/test/resources/testConfigs/withSolrConf/indexing/config/simple/conf/spellings.txt (original) +++ stanbol/branches/stanbol-solr4/entityhub/indexing/destination/solryard/src/test/resources/testConfigs/withSolrConf/indexing/config/simple/conf/spellings.txt Mon Mar 11 13:18:59 2013 @@ -1,2 +1,2 @@ -pizza -history \ No newline at end of file +pizza +history
Modified: stanbol/branches/stanbol-solr4/entityhub/indexing/destination/solryard/src/test/resources/testConfigs/withSolrConf/indexing/config/simple/conf/stopwords.txt URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-solr4/entityhub/indexing/destination/solryard/src/test/resources/testConfigs/withSolrConf/indexing/config/simple/conf/stopwords.txt?rev=1455131&r1=1455130&r2=1455131&view=diff ============================================================================== --- stanbol/branches/stanbol-solr4/entityhub/indexing/destination/solryard/src/test/resources/testConfigs/withSolrConf/indexing/config/simple/conf/stopwords.txt (original) +++ stanbol/branches/stanbol-solr4/entityhub/indexing/destination/solryard/src/test/resources/testConfigs/withSolrConf/indexing/config/simple/conf/stopwords.txt Mon Mar 11 13:18:59 2013 @@ -1,53 +1,14 @@ -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -#Standard english stop words taken from Lucene's StopAnalyzer -a -an -and -are -as -at -be -but -by -for -if -in -into -is -it -no -not -of -on -or -s -such -t -that -the -their -then -there -these -they -this -to -was -will -with - +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. 
See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. Added: stanbol/branches/stanbol-solr4/entityhub/indexing/destination/solryard/src/test/resources/testConfigs/withSolrConf/indexing/config/simple/conf/stopwords_en.txt URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-solr4/entityhub/indexing/destination/solryard/src/test/resources/testConfigs/withSolrConf/indexing/config/simple/conf/stopwords_en.txt?rev=1455131&view=auto ============================================================================== --- stanbol/branches/stanbol-solr4/entityhub/indexing/destination/solryard/src/test/resources/testConfigs/withSolrConf/indexing/config/simple/conf/stopwords_en.txt (added) +++ stanbol/branches/stanbol-solr4/entityhub/indexing/destination/solryard/src/test/resources/testConfigs/withSolrConf/indexing/config/simple/conf/stopwords_en.txt Mon Mar 11 13:18:59 2013 @@ -0,0 +1,54 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# a couple of test stopwords to test that the words are really being +# configured from this file: +stopworda +stopwordb + +# Standard english stop words taken from Lucene's StopAnalyzer +a +an +and +are +as +at +be +but +by +for +if +in +into +is +it +no +not +of +on +or +such +that +the +their +then +there +these +they +this +to +was +will +with Modified: stanbol/branches/stanbol-solr4/entityhub/indexing/destination/solryard/src/test/resources/testConfigs/withSolrConf/indexing/config/simple/conf/synonyms.txt URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-solr4/entityhub/indexing/destination/solryard/src/test/resources/testConfigs/withSolrConf/indexing/config/simple/conf/synonyms.txt?rev=1455131&r1=1455130&r2=1455131&view=diff ============================================================================== --- stanbol/branches/stanbol-solr4/entityhub/indexing/destination/solryard/src/test/resources/testConfigs/withSolrConf/indexing/config/simple/conf/synonyms.txt (original) +++ stanbol/branches/stanbol-solr4/entityhub/indexing/destination/solryard/src/test/resources/testConfigs/withSolrConf/indexing/config/simple/conf/synonyms.txt Mon Mar 11 13:18:59 2013 @@ -1,22 +1,29 @@ -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Some synonym groups specific to this example -GB,gib,gigabyte,gigabytes -MB,mib,megabyte,megabytes -Television, Televisions, TV, TVs -#notice we use "gib" instead of "GiB" so any WordDelimiterFilter coming -#after us won't split it into two words. - -# Synonym mappings can be used for spelling correction too -pixima => pixma - +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#----------------------------------------------------------------------- +#some test synonym mappings unlikely to appear in real input text +aaafoo => aaabar +bbbfoo => bbbfoo bbbbar +cccfoo => cccbar cccbaz +fooaaa,baraaa,bazaaa + +# Some synonym groups specific to this example +GB,gib,gigabyte,gigabytes +MB,mib,megabyte,megabytes +Television, Televisions, TV, TVs +#notice we use "gib" instead of "GiB" so any WordDelimiterFilter coming +#after us won't split it into two words. 
+ +# Synonym mappings can be used for spelling correction too +pixima => pixma + Modified: stanbol/branches/stanbol-solr4/entityhub/ldpath/pom.xml URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-solr4/entityhub/ldpath/pom.xml?rev=1455131&r1=1455130&r2=1455131&view=diff ============================================================================== --- stanbol/branches/stanbol-solr4/entityhub/ldpath/pom.xml (original) +++ stanbol/branches/stanbol-solr4/entityhub/ldpath/pom.xml Mon Mar 11 13:18:59 2013 @@ -130,7 +130,7 @@ <dependency> <groupId>org.apache.stanbol</groupId> <artifactId>org.apache.stanbol.entityhub.yard.solr</artifactId> - <version>0.11.0</version> + <version>0.12.0-SNAPSHOT</version> <scope>test</scope> </dependency> <dependency> Modified: stanbol/branches/stanbol-solr4/entityhub/yard/solr/pom.xml URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-solr4/entityhub/yard/solr/pom.xml?rev=1455131&r1=1455130&r2=1455131&view=diff ============================================================================== --- stanbol/branches/stanbol-solr4/entityhub/yard/solr/pom.xml (original) +++ stanbol/branches/stanbol-solr4/entityhub/yard/solr/pom.xml Mon Mar 11 13:18:59 2013 @@ -105,12 +105,12 @@ <dependency> <!-- provides Solr and OSGI utilities for Solr --> <groupId>org.apache.stanbol</groupId> <artifactId>org.apache.stanbol.commons.solr.core</artifactId> - <version>0.11.0</version> + <version>0.12.0-SNAPSHOT</version> </dependency> <dependency> <!-- provides managed Solr servers --> <groupId>org.apache.stanbol</groupId> <artifactId>org.apache.stanbol.commons.solr.managed</artifactId> - <version>0.11.0</version> + <version>0.12.0-SNAPSHOT</version> </dependency> <!-- Stanbol Entityhub internal dependencies --> Modified: stanbol/branches/stanbol-solr4/entityhub/yard/solr/src/main/java/org/apache/stanbol/entityhub/yard/solr/impl/SolrYard.java URL: 
http://svn.apache.org/viewvc/stanbol/branches/stanbol-solr4/entityhub/yard/solr/src/main/java/org/apache/stanbol/entityhub/yard/solr/impl/SolrYard.java?rev=1455131&r1=1455130&r2=1455131&view=diff ============================================================================== --- stanbol/branches/stanbol-solr4/entityhub/yard/solr/src/main/java/org/apache/stanbol/entityhub/yard/solr/impl/SolrYard.java (original) +++ stanbol/branches/stanbol-solr4/entityhub/yard/solr/src/main/java/org/apache/stanbol/entityhub/yard/solr/impl/SolrYard.java Mon Mar 11 13:18:59 2013 @@ -515,15 +515,15 @@ public class SolrYard extends AbstractYa server = _registeredServerTracker.getService(); //TODO: remove and replace with a setting where the SolrYard does not // not activate until the SolrServer is available. - if(server == null){ - for(int i = 0;i<5;i++){//waiting for a maximum of 5sec - try { - log.info(" ... waiting 1sec for SolrServer"); - - server = (SolrServer)_registeredServerTracker.waitForService(1000); - } catch (InterruptedException e) {} - } - } +// if(server == null){ +// for(int i = 0;i<5;i++){//waiting for a maximum of 5sec +// try { +// log.info(" ... 
waiting 1sec for SolrServer"); +// +// server = (SolrServer)_registeredServerTracker.waitForService(1000); +// } catch (InterruptedException e) {} +// } +// } if(server != null && !server.equals(this._server)){ //reset the fieldMapper so that it is reinitialised for the new one //STANBOL-519 Modified: stanbol/branches/stanbol-solr4/entityhub/yard/solr/src/main/java/org/apache/stanbol/entityhub/yard/solr/query/QueryUtils.java URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-solr4/entityhub/yard/solr/src/main/java/org/apache/stanbol/entityhub/yard/solr/query/QueryUtils.java?rev=1455131&r1=1455130&r2=1455131&view=diff ============================================================================== --- stanbol/branches/stanbol-solr4/entityhub/yard/solr/src/main/java/org/apache/stanbol/entityhub/yard/solr/query/QueryUtils.java (original) +++ stanbol/branches/stanbol-solr4/entityhub/yard/solr/src/main/java/org/apache/stanbol/entityhub/yard/solr/query/QueryUtils.java Mon Mar 11 13:18:59 2013 @@ -27,9 +27,12 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.icu.segmentation.DefaultICUTokenizerConfig; +import org.apache.lucene.analysis.icu.segmentation.ICUTokenizer; +import org.apache.lucene.analysis.icu.segmentation.ICUTokenizerConfig; +import org.apache.lucene.analysis.icu.segmentation.ICUTokenizerFactory; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; -import org.apache.solr.analysis.ICUTokenizerFactory; -import org.apache.solr.analysis.TokenizerFactory; +import org.apache.lucene.analysis.util.TokenizerFactory; import org.apache.stanbol.commons.solr.utils.SolrUtil; import org.apache.stanbol.entityhub.yard.solr.defaults.IndexDataTypeEnum; import org.apache.stanbol.entityhub.yard.solr.model.IndexValue; @@ -38,18 +41,14 @@ import org.apache.stanbol.entityhub.yard public final class QueryUtils { private QueryUtils() {} /** - * The {@link 
TokenizerFactory} used to create Tokens for parsed - * {@link IndexValue#getValue()} in case <code>false</code> is parsed for - * the tokenize property of {@link #encodeQueryValue(IndexValue, boolean)}. - * <p> - * Currently the {@link ICUTokenizerFactory} is used for Tokenizing. + * The {@link DefaultICUTokenizerConfig} */ - private final static TokenizerFactory tokenizerFactory = new ICUTokenizerFactory(); + private final static ICUTokenizerConfig tokenizerConfig = new DefaultICUTokenizerConfig(); /** * Regex patter that searches for Wildcard chars '*' and '?' excluding * escaped versions '\*' and '\?' */ - private final static Pattern wILDCARD_QUERY_CHAR_PATTERN = Pattern.compile("[^\\\\][\\*\\?]"); + private final static Pattern WILDCARD_QUERY_CHAR_PATTERN = Pattern.compile("[^\\\\][\\*\\?]"); /** * This method encodes a parsed index value as needed for queries. @@ -76,7 +75,7 @@ public final class QueryUtils { * instead * </ul> * - * @param value + * @param indexValue * the index value * @param escape if <code>true</code> all Solr special chars are escaped if * <code>false</code> than '*' and '?' as used for wildcard searches are @@ -182,8 +181,8 @@ public final class QueryUtils { private static String[] parseWildcardQueryTerms(String value,boolean loewercaseWildcardTokens) { //This assumes that the Tokenizer does tokenize '*' and '?', //what makes it a little bit tricky. - Tokenizer tokenizer = tokenizerFactory.create(new StringReader(value)); - Matcher m = wILDCARD_QUERY_CHAR_PATTERN.matcher(value); + Tokenizer tokenizer = new ICUTokenizer(new StringReader(value),tokenizerConfig); + Matcher m = WILDCARD_QUERY_CHAR_PATTERN.matcher(value); int next = m.find()?m.start()+1:-1; if(next < 0){ //No wildcard return new String[]{'"'+value+'"'}; @@ -194,6 +193,7 @@ public final class QueryUtils { boolean foundWildcard = false; //Lucene tokenizer are really low level ... 
try { + tokenizer.reset(); //starting with Solr4 reset MUST BE called before using while(tokenizer.incrementToken()){ //only interested in the start/end indexes of tokens OffsetAttribute offset = tokenizer.addAttribute(OffsetAttribute.class); Modified: stanbol/branches/stanbol-solr4/entityhub/yard/solr/src/main/resources/solr/core/default.solrindex.zip URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-solr4/entityhub/yard/solr/src/main/resources/solr/core/default.solrindex.zip?rev=1455131&r1=1455130&r2=1455131&view=diff ============================================================================== Binary files - no diff available. Modified: stanbol/branches/stanbol-solr4/entityhub/yard/solr/src/main/resources/solr/core/entityhub.solrindex.zip URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-solr4/entityhub/yard/solr/src/main/resources/solr/core/entityhub.solrindex.zip?rev=1455131&r1=1455130&r2=1455131&view=diff ============================================================================== Binary files - no diff available. Added: stanbol/branches/stanbol-solr4/entityhub/yard/solr/src/main/resources/solr/core/kuromoji.solrindex.zip URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-solr4/entityhub/yard/solr/src/main/resources/solr/core/kuromoji.solrindex.zip?rev=1455131&view=auto ============================================================================== Binary file - no diff available. 
Propchange: stanbol/branches/stanbol-solr4/entityhub/yard/solr/src/main/resources/solr/core/kuromoji.solrindex.zip ------------------------------------------------------------------------------ svn:mime-type = application/octet-stream Added: stanbol/branches/stanbol-solr4/entityhub/yard/solr/src/main/resources/solr/core/paoding.solrindex.outdated URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-solr4/entityhub/yard/solr/src/main/resources/solr/core/paoding.solrindex.outdated?rev=1455131&view=auto ============================================================================== Binary file - no diff available. Propchange: stanbol/branches/stanbol-solr4/entityhub/yard/solr/src/main/resources/solr/core/paoding.solrindex.outdated ------------------------------------------------------------------------------ svn:mime-type = application/octet-stream Modified: stanbol/branches/stanbol-solr4/entityhub/yard/solr/src/main/resources/solr/core/smartcn.solrindex.zip URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-solr4/entityhub/yard/solr/src/main/resources/solr/core/smartcn.solrindex.zip?rev=1455131&r1=1455130&r2=1455131&view=diff ============================================================================== Binary files - no diff available. 
Modified: stanbol/branches/stanbol-solr4/integration-tests/pom.xml URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-solr4/integration-tests/pom.xml?rev=1455131&r1=1455130&r2=1455131&view=diff ============================================================================== --- stanbol/branches/stanbol-solr4/integration-tests/pom.xml (original) +++ stanbol/branches/stanbol-solr4/integration-tests/pom.xml Mon Mar 11 13:18:59 2013 @@ -149,7 +149,7 @@ <dependency> <groupId>org.apache.stanbol</groupId> <artifactId>org.apache.stanbol.entityhub.test</artifactId> - <version>0.11.0</version> + <version>0.12.0-SNAPSHOT</version> </dependency> <dependency> <groupId>org.apache.stanbol</groupId> @@ -235,6 +235,17 @@ <groupId>org.apache.clerezza</groupId> <artifactId>rdf.rdfjson</artifactId> </dependency> + <!-- use log4j for logging --> + <dependency> <!-- used for debug level logging during tests --> + <groupId>org.slf4j</groupId> + <artifactId>slf4j-log4j12</artifactId> + <scope>test</scope> + </dependency> + <dependency> + <groupId>log4j</groupId> + <artifactId>log4j</artifactId> + <scope>test</scope> + </dependency> </dependencies> </project> Modified: stanbol/branches/stanbol-solr4/integration-tests/src/test/java/org/apache/stanbol/enhancer/it/EnhancerTestBase.java URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-solr4/integration-tests/src/test/java/org/apache/stanbol/enhancer/it/EnhancerTestBase.java?rev=1455131&r1=1455130&r2=1455131&view=diff ============================================================================== --- stanbol/branches/stanbol-solr4/integration-tests/src/test/java/org/apache/stanbol/enhancer/it/EnhancerTestBase.java (original) +++ stanbol/branches/stanbol-solr4/integration-tests/src/test/java/org/apache/stanbol/enhancer/it/EnhancerTestBase.java Mon Mar 11 13:18:59 2013 @@ -187,6 +187,13 @@ public class EnhancerTestBase extends St .assertContentRegexp( "http:\\\\/\\\\/.*\\\\/entityhub\\\\/site\\\\/dbpedia\\\\/" ); + //also assert 
that the SolrYard for the dbpedia site is fully + //initialized + executor.execute( + builder.buildGetRequest("/entityhub/site/dbpedia" + + "/entity?id=urn:does:not:exist:f82js95xsig39s.23987") + .withHeader("Accept", "application/json")) + .assertStatus(404); log.info("Enhancement engines checked for '{}', all present", endpoint); return true; } Modified: stanbol/branches/stanbol-solr4/integration-tests/src/test/java/org/apache/stanbol/entityhub/it/EntityhubTest.java URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-solr4/integration-tests/src/test/java/org/apache/stanbol/entityhub/it/EntityhubTest.java?rev=1455131&r1=1455130&r2=1455131&view=diff ============================================================================== --- stanbol/branches/stanbol-solr4/integration-tests/src/test/java/org/apache/stanbol/entityhub/it/EntityhubTest.java (original) +++ stanbol/branches/stanbol-solr4/integration-tests/src/test/java/org/apache/stanbol/entityhub/it/EntityhubTest.java Mon Mar 11 13:18:59 2013 @@ -117,6 +117,8 @@ public final class EntityhubTest extends testEntityUpdated(); testEntityDelete(); testEntityDeleted(); + testEntityDeleteAll(); + testAllEntitiesDeleted(); } private void testEntityCreation() throws IOException { InputStream in = EntityhubTest.class.getClassLoader().getResourceAsStream("doap_Stanbol.rdf"); @@ -198,6 +200,19 @@ public final class EntityhubTest extends .withHeader("Accept", "application/json")); re.assertStatus(404); } + private void testEntityDeleteAll() throws IOException { + Request request = builder.buildOtherRequest(new HttpDelete( + builder.buildUrl("/entityhub/entity", "id", "*"))); + RequestExecutor re = executor.execute(request); + re.assertStatus(200); + } + private void testAllEntitiesDeleted() throws IOException { + String id = "http://xml.apache.org/xerces-c/"; + RequestExecutor re = executor.execute( + builder.buildGetRequest("/entityhub/entity","id",id) + .withHeader("Accept", "application/json")); + 
re.assertStatus(404); + } @Test public void testEntityLookup() throws IOException, JSONException { String uri = "http://dbpedia.org/resource/Paris"; @@ -285,30 +300,36 @@ public final class EntityhubTest extends } private void testFindLimitAndOffsetQuery() throws IOException, JSONException { - FindQueryTestCase test = new FindQueryTestCase("XML*", + //With Solr4 we need a test that produces different scores for results, + //to ensure consistent ordering + FindQueryTestCase test = new FindQueryTestCase("XML XSL*", Arrays.asList( - "http://xerces.apache.org/xml-commons/components/external/", - "http://xml.apache.org/xerces-c/", - "http://xerces.apache.org/xerces2-j/", - "http://xerces.apache.org/xerces-p", - "http://xerces.apache.org/xml-commons/components/resolver/"), + "http://velocity.apache.org/anakia/", + "http://xalan.apache.org/xalan-c/", + "http://xalan.apache.org/xalan-j/", + "http://velocity.apache.org/dvsl/devel/", + "http://xmlgraphics.apache.org/commons/", + "http://xmlgraphics.apache.org/fop"), null); - test.setField("http://usefulinc.com/ns/doap#name"); + test.setField("http://usefulinc.com/ns/doap#description"); + test.setLimit(10); test.setLanguage(null); executeQuery(test); //repeat the test with offset 2 and limit 2 to only retrieve the 3-4 result - test = new FindQueryTestCase("XML*", + test = new FindQueryTestCase("XML XSL*", Arrays.asList( - "http://xerces.apache.org/xml-commons/components/external/", - "http://xerces.apache.org/xerces-p"), + "http://xalan.apache.org/xalan-j/", + "http://velocity.apache.org/dvsl/devel/"), Arrays.asList( - "http://xml.apache.org/xerces-c/", - "http://xerces.apache.org/xerces2-j/", - "http://xerces.apache.org/xml-commons/components/resolver/")); - test.setField("http://usefulinc.com/ns/doap#name"); + "http://velocity.apache.org/anakia/", + "http://xalan.apache.org/xalan-c/", + "http://xmlgraphics.apache.org/commons/", + "http://xmlgraphics.apache.org/fop")); +
test.setField("http://usefulinc.com/ns/doap#description"); test.setOffset(2); test.setLimit(2); test.setLanguage(null); + executeQuery(test); } Added: stanbol/branches/stanbol-solr4/integration-tests/src/test/resources/log4j.properties URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-solr4/integration-tests/src/test/resources/log4j.properties?rev=1455131&view=auto ============================================================================== --- stanbol/branches/stanbol-solr4/integration-tests/src/test/resources/log4j.properties (added) +++ stanbol/branches/stanbol-solr4/integration-tests/src/test/resources/log4j.properties Mon Mar 11 13:18:59 2013 @@ -0,0 +1,24 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# Root logger option +log4j.rootLogger=INFO, stdout + +# Direct log messages to stdout +log4j.appender.stdout=org.apache.log4j.ConsoleAppender +log4j.appender.stdout.Target=System.out +log4j.appender.stdout.layout=org.apache.log4j.PatternLayout +log4j.appender.stdout.layout.ConversionPattern=%d{ABSOLUTE} %5p %c{1}:%L - %m%n +# log4j.logger.org.apache.stanbol=DEBUG \ No newline at end of file Propchange: stanbol/branches/stanbol-solr4/integration-tests/src/test/resources/log4j.properties ------------------------------------------------------------------------------ svn:mime-type = text/plain Propchange: stanbol/branches/stanbol-solr4/launchers/bundlelists/language-extras/kuromoji/ ------------------------------------------------------------------------------ --- svn:ignore (added) +++ svn:ignore Mon Mar 11 13:18:59 2013 @@ -0,0 +1,5 @@ +.project + +.settings + +target Added: stanbol/branches/stanbol-solr4/launchers/bundlelists/language-extras/kuromoji/README.md URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-solr4/launchers/bundlelists/language-extras/kuromoji/README.md?rev=1455131&view=auto ============================================================================== --- stanbol/branches/stanbol-solr4/launchers/bundlelists/language-extras/kuromoji/README.md (added) +++ stanbol/branches/stanbol-solr4/launchers/bundlelists/language-extras/kuromoji/README.md Mon Mar 11 13:18:59 2013 @@ -0,0 +1,77 @@ +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> + +Japanese language support based on Lucene Kuromoji Analyzer +============== + +This BundleList includes three modules that bring Japanese language support to Apache Stanbol. + +See comments in the [list.xml](src/main/bundles/list.xml) for more details. + +Solr Field Configuration +--- + +When you plan to use this Analyzer to process Japanese texts it is important to also properly configure the Solr schema.xml used by the Entityhub SolrYard. + +For that you will need to add two things: + +1. A fieldType specification for Japanese + + :::xml + <fieldType name="text_ja" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="false"> + <analyzer> + <tokenizer class="solr.JapaneseTokenizerFactory" mode="search"/> + <filter class="solr.JapaneseBaseFormFilterFactory"/> + <filter class="solr.JapanesePartOfSpeechStopFilterFactory" tags="lang/stoptags_ja.txt" enablePositionIncrements="true"/> + <filter class="solr.CJKWidthFilterFactory"/> + <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ja.txt" enablePositionIncrements="true" /> + <filter class="solr.JapaneseKatakanaStemFilterFactory" minimumLength="4"/> + <filter class="solr.LowerCaseFilterFactory"/> + </analyzer> + </fieldType> + +2. A dynamic field using this field type that matches against Japanese language literals + + :::xml + <!-- + Dynamic field for the Japanese language.
+ --> + <dynamicField name="@ja*" type="text_ja" indexed="true" stored="true" multiValued="true" omitNorms="false"/> + +The [kuromoji.solrindex.zip](https://svn.apache.org/repos/asf/stanbol/trunk/entityhub/yard/solr/src/main/resources/solr/core/kuromoji.solrindex.zip) is identical to the default configuration but uses the above fieldType and dynamicField specification. + +### Usage with the EntityhubIndexing Tool + +1. Extract the [kuromoji.solrindex.zip](https://svn.apache.org/repos/asf/stanbol/trunk/entityhub/yard/solr/src/main/resources/solr/core/kuromoji.solrindex.zip) to the "indexing/config" directory +2. Rename the "indexing/config/kuromoji" directory to the {site-name} (the value of the "name" property of the "indexing/config/indexing.properties" file). + +As an alternative to (2) you can also explicitly configure the name of the solr config as value to the "solrConf:kuromoji" of SolrYardIndexingDestination. + + :::text + indexingDestination=org.apache.stanbol.entityhub.indexing.destination.solryard.SolrYardIndexingDestination,solrConf:kuromoji,boosts:fieldboosts + +### Usage with the Entityhub SolrYard + +If you want to create an empty SolrYard instance using the [kuromoji.solrindex.zip](https://svn.apache.org/repos/asf/stanbol/trunk/entityhub/yard/solr/src/main/resources/solr/core/kuromoji.solrindex.zip) configuration you will need to + +1. copy the kuromoji.solrindex.zip to the datafiles directory of your Stanbol instance ({working-dir}/stanbol/datafiles) +2. rename it to the {name} of the SolrYard you want to create. The file name needs to be {name}.solrindex.zip +3. create the SolrYard instance and configure the "Solr Index/Core" (org.apache.stanbol.entityhub.yard.solr.solrUri) to {name}. Make sure the "Use default SolrCore configuration" (org.apache.stanbol.entityhub.yard.solr.useDefaultConfig) is disabled.
+ +If you want to use the kuromoji.solrindex.zip as default you can rename the file in the datafiles folder to "default.solrindex.zip" and then enable the "Use default SolrCore configuration" (org.apache.stanbol.entityhub.yard.solr.useDefaultConfig) when you configure a SolrYard instance. + +See also the documentation on how to [configure a managed site](http://stanbol.apache.org/docs/trunk/components/entityhub/managedsite#configuration-of-managedsites). Added: stanbol/branches/stanbol-solr4/launchers/bundlelists/language-extras/kuromoji/pom.xml URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-solr4/launchers/bundlelists/language-extras/kuromoji/pom.xml?rev=1455131&view=auto ============================================================================== --- stanbol/branches/stanbol-solr4/launchers/bundlelists/language-extras/kuromoji/pom.xml (added) +++ stanbol/branches/stanbol-solr4/launchers/bundlelists/language-extras/kuromoji/pom.xml Mon Mar 11 13:18:59 2013 @@ -0,0 +1,50 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor + license agreements. See the NOTICE file distributed with this work for additional + information regarding copyright ownership. The ASF licenses this file to + You under the Apache License, Version 2.0 (the "License"); you may not use + this file except in compliance with the License. You may obtain a copy of + the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required + by applicable law or agreed to in writing, software distributed under the + License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS + OF ANY KIND, either express or implied. See the License for the specific + language governing permissions and limitations under the License.
--> +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd"> + + <modelVersion>4.0.0</modelVersion> + <prerequisites> <!-- The maven-launchpad-plugin requires maven 3 --> + <maven>3.0.3</maven> + </prerequisites> + + <parent> + <groupId>org.apache.stanbol</groupId> + <artifactId>apache-stanbol-bundlelists</artifactId> + <version>0.10.0-SNAPSHOT</version> + <relativePath>../..</relativePath> + </parent> + + <groupId>org.apache.stanbol</groupId> + <artifactId>org.apache.stanbol.launchers.bundlelists.languageextras.kuromoji</artifactId> + <version>0.10.0-SNAPSHOT</version> + <packaging>partialbundlelist</packaging> + + <name>Apache Stanbol Bundlelist for Language Support: Kuromoji Japanese</name> + <description> + Provides modules that bring language support for Japanese using + the Solr/Lucene kuromoji analyzer. This includes (1) a Bundle providing the + Solr Analyzer; (2) an NLP processing Engine that Tokenizes, detects + sentences, POS tags, extracts Named Entities and Lemmatizes Japanese text; + (3) a LabelTokenizer needed to match tokens of the analyzed text with + the labels of Entities in the matched vocabularies. 
+ </description> + + <build> + <plugins> + <plugin> + <groupId>org.apache.sling</groupId> + <artifactId>maven-launchpad-plugin</artifactId> + </plugin> + </plugins> + </build> +</project> Propchange: stanbol/branches/stanbol-solr4/launchers/bundlelists/language-extras/kuromoji/pom.xml ------------------------------------------------------------------------------ svn:executable = * Added: stanbol/branches/stanbol-solr4/launchers/bundlelists/language-extras/kuromoji/src/main/bundles/list.xml URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-solr4/launchers/bundlelists/language-extras/kuromoji/src/main/bundles/list.xml?rev=1455131&view=auto ============================================================================== --- stanbol/branches/stanbol-solr4/launchers/bundlelists/language-extras/kuromoji/src/main/bundles/list.xml (added) +++ stanbol/branches/stanbol-solr4/launchers/bundlelists/language-extras/kuromoji/src/main/bundles/list.xml Mon Mar 11 13:18:59 2013 @@ -0,0 +1,58 @@ +<?xml version="1.0" ?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<bundles> + <!-- + The kuromoji analyzer bundle (extension to o.a.s.commons.solr.core module) + --> + <startLevel level="28"> <!-- commons.solr.core uses startlevel 27 --> + <bundle> + <groupId>org.apache.stanbol</groupId> + <artifactId>org.apache.stanbol.commons.solr.extras.kuromoji</artifactId> + <version>0.12.0-SNAPSHOT</version> + </bundle> + </startLevel> + + <!-- + The kuromoji NLP processing engine + --> + <startLevel level="35"> <!-- same startlevel as other Enhancement Engines --> + <bundle> + <groupId>org.apache.stanbol</groupId> + <artifactId>org.apache.stanbol.enhancer.engines.kuromoji.nlp</artifactId> + <version>0.10.1-SNAPSHOT</version> + </bundle> + </startLevel> + + <!-- + The Japanese LabelTokenizer required by the EntityLinkingEngine to compare + Tokens in the AnalyzedText with Labels of the Entities found in the + Controlled vocabulary. + --> + <!-- + startlevel needs to be greater as those of the EntityLinkingEngine + (o.a.s.enhancer.engines.entitylinking.engine) module + --> + <startLevel level="36"> + <bundle> + <groupId>org.apache.stanbol</groupId> + <artifactId>org.apache.stanbol.enhancer.engines.entitylinking.labeltokenizer.kuromoji</artifactId> + <version>0.10.1-SNAPSHOT</version> + </bundle> + </startLevel> + +</bundles> \ No newline at end of file Modified: stanbol/branches/stanbol-solr4/launchers/bundlelists/pom.xml URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-solr4/launchers/bundlelists/pom.xml?rev=1455131&r1=1455130&r2=1455131&view=diff ============================================================================== --- stanbol/branches/stanbol-solr4/launchers/bundlelists/pom.xml (original) +++ stanbol/branches/stanbol-solr4/launchers/bundlelists/pom.xml Mon Mar 11 13:18:59 2013 @@ -82,6 +82,7 @@ <!-- language specific extensions --> <module>language-extras/smartcn</module> <module>language-extras/paoding</module> + <module>language-extras/kuromoji</module> </modules> <profiles> Modified: 
stanbol/branches/stanbol-solr4/launchers/bundlelists/stanbolcommons/src/main/bundles/list.xml URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-solr4/launchers/bundlelists/stanbolcommons/src/main/bundles/list.xml?rev=1455131&r1=1455130&r2=1455131&view=diff ============================================================================== --- stanbol/branches/stanbol-solr4/launchers/bundlelists/stanbolcommons/src/main/bundles/list.xml (original) +++ stanbol/branches/stanbol-solr4/launchers/bundlelists/stanbolcommons/src/main/bundles/list.xml Mon Mar 11 13:18:59 2013 @@ -134,6 +134,16 @@ <version>1.8.3_1</version> </bundle> <bundle> + <groupId>com.google.guava</groupId> + <artifactId>guava</artifactId> + <version>13.0.1</version> + </bundle> + <bundle> <!-- used by Solr4 spatial --> + <groupId>org.apache.servicemix.bundles</groupId> + <artifactId>org.apache.servicemix.bundles.spatial4j</artifactId> + <version>0.3_1</version> + </bundle> + <bundle> <groupId>org.apache.httpcomponents</groupId> <artifactId>httpcore-osgi</artifactId> <version>4.2.3</version> @@ -229,11 +239,11 @@ <version>1.15</version> </bundle> <!-- needed to read data from mime multipart requests --> - <bundle> + <!-- bundle> <groupId>org.apache.clerezza</groupId> <artifactId>jaxrs.utils</artifactId> <version>0.6-incubating</version> - </bundle> + </bundle --> <!-- still used in many places also it only runs on jersey and the code is not portable across jax-rs implementations --> <bundle> <groupId>com.sun.jersey.contribs</groupId> Modified: stanbol/branches/stanbol-solr4/launchers/full/pom.xml URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-solr4/launchers/full/pom.xml?rev=1455131&r1=1455130&r2=1455131&view=diff ============================================================================== --- stanbol/branches/stanbol-solr4/launchers/full/pom.xml (original) +++ stanbol/branches/stanbol-solr4/launchers/full/pom.xml Mon Mar 11 13:18:59 2013 @@ -283,19 +283,27 @@ <!-- Basic Cinese 
language support (STANBOL-855 --> <dependency> <groupId>org.apache.stanbol</groupId> - <artifactId>org.apache.stanbol.launchers.bundlelists.languageextras.smartcn</artifactId> - <version>0.10.0-SNAPSHOT</version> + <artifactId>org.apache.stanbol.launchers.bundlelists.languageextras.smartcn</artifactId> + <version>0.10.0-SNAPSHOT</version> <type>partialbundlelist</type> <scope>provided</scope> </dependency> - <dependency> + <!-- TODO Paoding does not yet support Solr 4 --> + <!-- dependency> <groupId>org.apache.stanbol</groupId> <artifactId>org.apache.stanbol.launchers.bundlelists.languageextras.paoding</artifactId> <version>0.10.0-SNAPSHOT</version> <type>partialbundlelist</type> <scope>provided</scope> + </dependency --> + <!-- Japanese Language Support --> + <dependency> + <groupId>org.apache.stanbol</groupId> + <artifactId>org.apache.stanbol.launchers.bundlelists.languageextras.kuromoji</artifactId> + <version>0.10.0-SNAPSHOT</version> + <type>partialbundlelist</type> + <scope>provided</scope> </dependency> - </dependencies> </project> Modified: stanbol/branches/stanbol-solr4/parent/pom.xml URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-solr4/parent/pom.xml?rev=1455131&r1=1455130&r2=1455131&view=diff ============================================================================== --- stanbol/branches/stanbol-solr4/parent/pom.xml (original) +++ stanbol/branches/stanbol-solr4/parent/pom.xml Mon Mar 11 13:18:59 2013 @@ -60,7 +60,7 @@ <jersey-version>1.15</jersey-version> <freemarker-version>2.3.19</freemarker-version> <owlapi-version>3.3</owlapi-version> - <solr-version>3.6.1</solr-version> + <solr-version>4.1.0</solr-version> <pax-exam-version>2.3.0.M1</pax-exam-version> <sourceReleaseAssemblyDescriptor>stanbol-source-release-zip-tar</sourceReleaseAssemblyDescriptor> </properties> @@ -736,6 +736,13 @@ <artifactId>httpmime</artifactId> <version>4.2.1</version> </dependency> + + <!-- Google Commons --> + <dependency> + <groupId>com.google.guava</groupId> + 
<artifactId>guava</artifactId> + <version>13.0.1</version> + </dependency> <!-- Joda Time --> <dependency> @@ -966,12 +973,16 @@ <groupId>org.apache.solr</groupId> <artifactId>solr-core</artifactId> <version>${solr-version}</version> - <!-- exclusions> + <exclusions> + <exclusion> + <groupId>org.slf4j</groupId> + <artifactId>jcl-over-slf4j</artifactId> + </exclusion> <exclusion> <groupId>org.slf4j</groupId> <artifactId>slf4j-jdk14</artifactId> </exclusion> - </exclusions --> + </exclusions> </dependency> <!-- dependency> <groupId>org.apache.solr</groupId> @@ -1023,7 +1034,7 @@ </dependency> <dependency> <groupId>org.apache.lucene</groupId> - <artifactId>lucene-analyzers</artifactId> + <artifactId>lucene-analyzers-common</artifactId> <version>${solr-version}</version> </dependency> <dependency> @@ -1038,6 +1049,11 @@ </dependency> <dependency> <groupId>org.apache.lucene</groupId> + <artifactId>lucene-codecs</artifactId> + <version>${solr-version}</version> + </dependency> + <dependency> + <groupId>org.apache.lucene</groupId> <artifactId>lucene-misc</artifactId> <version>${solr-version}</version> </dependency> @@ -1048,9 +1064,29 @@ </dependency> <dependency> <groupId>org.apache.lucene</groupId> - <artifactId>lucene-phonetic</artifactId> + <artifactId>lucene-queryparser</artifactId> + <version>${solr-version}</version> + </dependency> + <dependency> + <groupId>org.apache.lucene</groupId> + <artifactId>lucene-analyzers-phonetic</artifactId> + <version>${solr-version}</version> + </dependency> + <dependency> + <groupId>org.apache.lucene</groupId> + <artifactId>lucene-analyzers-stempel</artifactId> <version>${solr-version}</version> </dependency> + <dependency> + <groupId>org.apache.lucene</groupId> + <artifactId>lucene-analyzers-smartcn</artifactId> + <version>${solr-version}</version> + </dependency> + <dependency> + <groupId>org.apache.lucene</groupId> + <artifactId>lucene-analyzers-kuromoji</artifactId> + <version>${solr-version}</version> + </dependency> <!-- 
Snowball moved to analyzer in 3.1<dependency> <groupId>org.apache.lucene</groupId> <artifactId>lucene-snowball</artifactId> @@ -1058,7 +1094,12 @@ </dependency> --> <dependency> <groupId>org.apache.lucene</groupId> - <artifactId>lucene-spellchecker</artifactId> + <artifactId>lucene-suggest</artifactId> + <version>${solr-version}</version> + </dependency> + <dependency> + <groupId>org.apache.lucene</groupId> + <artifactId>lucene-sandbox</artifactId> <version>${solr-version}</version> </dependency> <dependency> @@ -1068,7 +1109,7 @@ </dependency> <dependency> <groupId>org.apache.lucene</groupId> - <artifactId>lucene-icu</artifactId> + <artifactId>lucene-analyzers-icu</artifactId> <version>${solr-version}</version> </dependency> <dependency> @@ -1081,28 +1122,6 @@ <artifactId>lucene-grouping</artifactId> <version>${solr-version}</version> </dependency> - <!-- other unused Lucene bundes - <dependency> - <groupId>org.apache.lucene</groupId> - <artifactId>lucene-queryparser</artifactId> - <version>${solr-version}</version> - </dependency> - <dependency> - <groupId>org.apache.lucene</groupId> - <artifactId>lucene-smartcn</artifactId> - <version>${solr-version}</version> - </dependency> - <dependency> - <groupId>org.apache.lucene</groupId> - <artifactId>lucene-stempel</artifactId> - <version>${solr-version}</version> - </dependency> - <dependency> - <groupId>org.apache.lucene</groupId> - <artifactId>lucene-kuromoji</artifactId> - <version>${solr-version}</version> - </dependency> - --> <!-- Other Solr/Lucene dependendies --> <!-- StAX Parser (used by Solr/Lucene) --> <!-- dependency> stay api is included in java 1.6 @@ -1121,17 +1140,6 @@ </exclusion> </exclusions> </dependency> - <!-- dependency> - <groupId>stax</groupId> - <artifactId>stax-api</artifactId> - <version>1.0.1</version> - <scope>runtime</scope> - </dependency> - <dependency> - <groupId>org.codehaus.woodstox</groupId> - <artifactId>wstx-asl</artifactId> - <version>3.2.7</version> - </dependency --> 
<dependency> <groupId>org.apache.servicemix.bundles</groupId> <artifactId>org.apache.servicemix.bundles.regexp</artifactId> @@ -1147,6 +1155,21 @@ <artifactId>portlet-api</artifactId> <version>2.0</version> </dependency> + <dependency> + <groupId>org.apache.zookeeper</groupId> + <artifactId>zookeeper</artifactId> + <version>3.4.5</version> + <exclusions> + <exclusion> + <groupId>log4j</groupId> + <artifactId>log4j</artifactId> + </exclusion> + <exclusion> + <groupId>org.slf4j</groupId> + <artifactId>slf4j-log4j12</artifactId> + </exclusion> + </exclusions> + </dependency> <!-- END Solr/Lucene dependencies --> <!-- JDom -->
