Hi!
I like the option to specify an Analyzer for jena-text, as implemented in
JENA-654. But I'd like to use an analyzer that is otherwise like
KeywordAnalyzer but case-insensitive, for use in an
autocomplete/typeahead UI widget. Lucene doesn't include such an
analyzer, but there are several implementations of the same idea, e.g.
in neo4j [1] and stargate [2].
I created my own implementation of such an analyzer and added code to
use it from the assembler. Patch attached.
This analyzer is now in a new package,
org.apache.jena.query.text.analyzer, in case other analyzers for
jena-text appear in the future. If you don't like the new package,
the class can of course be moved to org.apache.jena.query.text.
I also added a test for case-insensitivity. To avoid lots of duplicate
boilerplate code, I slightly modified and subclassed the existing test
for KeywordAnalyzer.
I'd love to see this in the next version of jena-text and Fuseki. Of
course I'll rework the patch if necessary. I can also tweak the web
documentation to mention this analyzer.
-Osma
[1]
https://github.com/apatry/neo4j-lucene4-index/blob/master/src/main/java/org/neo4j/index/impl/lucene/LowerCaseKeywordAnalyzer.java
[2]
https://github.com/tuplejump/stargate-core/blob/master/src/main/java/com/tuplejump/stargate/lucene/CaseInsensitiveKeywordAnalyzer.java
--
Osma Suominen
D.Sc. (Tech), Information Systems Specialist
National Library of Finland
P.O. Box 26 (Teollisuuskatu 23)
00014 HELSINGIN YLIOPISTO
Tel. +358 50 3199529
[email protected]
http://www.nationallibrary.fi
Index: jena-text/src/test/java/org/apache/jena/query/text/TS_Text.java
===================================================================
--- jena-text/src/test/java/org/apache/jena/query/text/TS_Text.java (revision 1621760)
+++ jena-text/src/test/java/org/apache/jena/query/text/TS_Text.java (working copy)
@@ -44,6 +44,7 @@
, TestDatasetWithSimpleAnalyzer.class
, TestDatasetWithStandardAnalyzer.class
, TestDatasetWithKeywordAnalyzer.class
+ , TestDatasetWithLowerCaseKeywordAnalyzer.class
})
public class TS_Text
Index: jena-text/src/test/java/org/apache/jena/query/text/TestDatasetWithKeywordAnalyzer.java
===================================================================
--- jena-text/src/test/java/org/apache/jena/query/text/TestDatasetWithKeywordAnalyzer.java (revision 1621760)
+++ jena-text/src/test/java/org/apache/jena/query/text/TestDatasetWithKeywordAnalyzer.java (working copy)
@@ -47,9 +47,9 @@
private static final String SPEC_BASE = "http://example.org/spec#";
private static final String SPEC_ROOT_LOCAL = "lucene_text_dataset";
private static final String SPEC_ROOT_URI = SPEC_BASE + SPEC_ROOT_LOCAL;
- private static final String SPEC;
- static {
- SPEC = StrUtils.strjoinNL(
+
+ private static String makeSpec(String analyzer) {
+ return StrUtils.strjoinNL(
"prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> ",
"prefix ja: <http://jena.hpl.hp.com/2005/11/Assembler#> ",
"prefix tdb: <http://jena.hpl.hp.com/2008/tdb#>",
@@ -87,15 +87,15 @@
" text:map (",
" [ text:field \"label\" ; ",
" text:predicate rdfs:label ;",
- " text:analyzer [ a text:KeywordAnalyzer ]",
+ " text:analyzer [ a " + analyzer + " ]",
" ]",
" [ text:field \"comment\" ; text:predicate rdfs:comment ]",
" ) ."
);
}
- public static void init() {
- Reader reader = new StringReader(SPEC);
+ public static void init(String analyzer) {
+ Reader reader = new StringReader(makeSpec(analyzer));
Model specModel = ModelFactory.createDefaultModel();
specModel.read(reader, "", "TURTLE");
TextAssembler.init();
@@ -111,7 +111,7 @@
}
@BeforeClass public static void beforeClass() {
- init();
+ init("text:KeywordAnalyzer");
}
@AfterClass public static void afterClass() {
Index: jena-text/src/test/java/org/apache/jena/query/text/assembler/TestEntityMapAssembler.java
===================================================================
--- jena-text/src/test/java/org/apache/jena/query/text/assembler/TestEntityMapAssembler.java (revision 1621760)
+++ jena-text/src/test/java/org/apache/jena/query/text/assembler/TestEntityMapAssembler.java (working copy)
@@ -27,6 +27,7 @@
import org.apache.jena.atlas.logging.LogCtl ;
import org.apache.jena.query.text.EntityDefinition ;
import org.apache.jena.query.text.TextIndexException ;
+import org.apache.jena.query.text.analyzer.LowerCaseKeywordAnalyzer ;
import org.apache.lucene.analysis.core.KeywordAnalyzer ;
import org.apache.lucene.analysis.core.SimpleAnalyzer ;
import org.apache.lucene.analysis.standard.StandardAnalyzer ;
@@ -55,6 +56,7 @@
private static final Resource spec3;
private static final Resource spec4;
private static final Resource spec5;
+ private static final Resource spec6;
private static final Resource specNoEntityField;
private static final Resource specNoDefaultField;
private static final Resource specNoMapProperty;
@@ -112,6 +114,12 @@
assertEquals(KeywordAnalyzer.class, entityDef.getAnalyzer(SPEC1_DEFAULT_FIELD).getClass());
}
+    @Test public void EntityHasMapEntryWithLowerCaseKeywordAnalyzer() {
+        // spec6 attaches a text:LowerCaseKeywordAnalyzer resource to SPEC1_DEFAULT_FIELD
+        EntityDefinition def = new EntityDefinitionAssembler().open(Assembler.general, spec6, null);
+        assertEquals(LowerCaseKeywordAnalyzer.class, def.getAnalyzer(SPEC1_DEFAULT_FIELD).getClass());
+    }
+
@Test(expected=TextIndexException.class) public void errorOnNoEntityField() {
EntityDefinitionAssembler entDefAssem = new EntityDefinitionAssembler();
entDefAssem.open(null, specNoEntityField, null);
@@ -231,6 +239,22 @@
.addProperty(RDF.type, TextVocab.keywordAnalyzer))
}));
+ // create a simple entity map specification using a lowercase keyword analyzer
+
+ spec6 = model.createResource(TESTBASE + "spec6")
+ .addProperty(TextVocab.pEntityField, SPEC1_ENTITY_FIELD)
+ .addProperty(TextVocab.pDefaultField, SPEC1_DEFAULT_FIELD)
+ .addProperty(TextVocab.pMap,
+ model.createList(
+ new RDFNode[] {
+ model.createResource()
+ .addProperty(TextVocab.pField, SPEC1_DEFAULT_FIELD)
+ .addProperty(TextVocab.pPredicate, SPEC1_PREDICATE)
+ .addProperty(TextVocab.pAnalyzer,
+ model.createResource()
+ .addProperty(RDF.type, TextVocab.lowerCaseKeywordAnalyzer))
+ }));
+
// bad assembler spec
specNoEntityField =
Index: jena-text/src/test/java/org/apache/jena/query/text/TestDatasetWithLowerCaseKeywordAnalyzer.java
===================================================================
--- jena-text/src/test/java/org/apache/jena/query/text/TestDatasetWithLowerCaseKeywordAnalyzer.java (revision 0)
+++ jena-text/src/test/java/org/apache/jena/query/text/TestDatasetWithLowerCaseKeywordAnalyzer.java (revision 0)
@@ -0,0 +1,58 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.jena.query.text;
+
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.jena.atlas.lib.StrUtils;
+
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+/**
+ * This class defines a setup configuration for a dataset that uses a lowercase keyword analyzer with a Lucene index.
+ */
+public class TestDatasetWithLowerCaseKeywordAnalyzer extends TestDatasetWithKeywordAnalyzer {
+    /** Initialises the test dataset with text:LowerCaseKeywordAnalyzer as the analyzer. */
+    @BeforeClass
+    public static void beforeClass() {
+        init("text:LowerCaseKeywordAnalyzer");
+    }
+
+    /** A mixed-case label must be matched by an all-lowercase prefix query. */
+    @Test
+    public void testLowerCaseKeywordAnalyzerIsCaseInsensitive() {
+        final String subjectUri = RESOURCE_BASE + "testLowerCaseKeywordAnalyzerIsCaseInsensitive";
+        final String data = StrUtils.strjoinNL(
+                TURTLE_PROLOG,
+                "<" + subjectUri + ">",
+                " rdfs:label 'F;riM at&/ped9'",
+                ".");
+        final String query = StrUtils.strjoinNL(
+                QUERY_PROLOG,
+                "SELECT ?s",
+                "WHERE {",
+                " ?s text:query ( rdfs:label 'f;ri*' 10 ) .",
+                "}");
+        final Set<String> expected = new HashSet<>(Arrays.asList(subjectUri));
+        doTestSearch(data, query, expected);
+    }
+}
Index: jena-text/src/main/java/org/apache/jena/query/text/assembler/LowerCaseKeywordAnalyzerAssembler.java
===================================================================
--- jena-text/src/main/java/org/apache/jena/query/text/assembler/LowerCaseKeywordAnalyzerAssembler.java (revision 0)
+++ jena-text/src/main/java/org/apache/jena/query/text/assembler/LowerCaseKeywordAnalyzerAssembler.java (revision 0)
@@ -0,0 +1,48 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.jena.query.text.assembler ;
+
+import org.apache.jena.query.text.TextIndexLucene;
+import org.apache.jena.query.text.analyzer.LowerCaseKeywordAnalyzer;
+import org.apache.lucene.analysis.Analyzer;
+
+import com.hp.hpl.jena.assembler.Assembler;
+import com.hp.hpl.jena.assembler.Mode;
+import com.hp.hpl.jena.assembler.assemblers.AssemblerBase;
+import com.hp.hpl.jena.rdf.model.Resource;
+
+/**
+ * Assembler to create lowercase keyword analyzers.
+ */
+public class LowerCaseKeywordAnalyzerAssembler extends AssemblerBase {
+ /*
+ Example assembler configuration:
+ text:map (
+ [ text:field "text" ;
+ text:predicate rdfs:label;
+ text:analyzer [ a text:LowerCaseKeywordAnalyzer ]
+ ]
+ ) .
+ */
+ /** Ignores its arguments and returns a new LowerCaseKeywordAnalyzer built with TextIndexLucene.VER. */
+ @Override
+ public Analyzer open(Assembler a, Resource root, Mode mode) {
+ return new LowerCaseKeywordAnalyzer(TextIndexLucene.VER);
+ }
+}
Index: jena-text/src/main/java/org/apache/jena/query/text/assembler/TextVocab.java
===================================================================
--- jena-text/src/main/java/org/apache/jena/query/text/assembler/TextVocab.java (revision 1621760)
+++ jena-text/src/main/java/org/apache/jena/query/text/assembler/TextVocab.java (working copy)
@@ -54,6 +54,7 @@
public static final Property pStopWords = Vocab.property(NS, "stopWords");
public static final Resource simpleAnalyzer = Vocab.resource(NS, "SimpleAnalyzer");
public static final Resource keywordAnalyzer = Vocab.resource(NS, "KeywordAnalyzer");
+ public static final Resource lowerCaseKeywordAnalyzer = Vocab.resource(NS, "LowerCaseKeywordAnalyzer");
}
Index: jena-text/src/main/java/org/apache/jena/query/text/assembler/TextAssembler.java
===================================================================
--- jena-text/src/main/java/org/apache/jena/query/text/assembler/TextAssembler.java (revision 1621760)
+++ jena-text/src/main/java/org/apache/jena/query/text/assembler/TextAssembler.java (working copy)
@@ -33,6 +33,7 @@
Assembler.general.implementWith(TextVocab.standardAnalyzer, new StandardAnalyzerAssembler()) ;
Assembler.general.implementWith(TextVocab.simpleAnalyzer, new SimpleAnalyzerAssembler()) ;
Assembler.general.implementWith(TextVocab.keywordAnalyzer, new KeywordAnalyzerAssembler()) ;
+ Assembler.general.implementWith(TextVocab.lowerCaseKeywordAnalyzer, new LowerCaseKeywordAnalyzerAssembler()) ;
}
}
Index: jena-text/src/main/java/org/apache/jena/query/text/analyzer/LowerCaseKeywordAnalyzer.java
===================================================================
--- jena-text/src/main/java/org/apache/jena/query/text/analyzer/LowerCaseKeywordAnalyzer.java (revision 0)
+++ jena-text/src/main/java/org/apache/jena/query/text/analyzer/LowerCaseKeywordAnalyzer.java (revision 0)
@@ -0,0 +1,49 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.jena.query.text.analyzer ;
+
+import java.io.Reader ;
+
+import org.apache.lucene.analysis.Analyzer ;
+import org.apache.lucene.analysis.core.KeywordTokenizer ;
+import org.apache.lucene.analysis.core.LowerCaseFilter ;
+import org.apache.lucene.util.Version ;
+
+
+/**
+ * Lucene Analyzer implementation that works like KeywordAnalyzer (i.e.
+ * doesn't tokenize the input, keeps it as a single token), but forces text
+ * to lowercase and is thus case-insensitive.
+ */
+
+public class LowerCaseKeywordAnalyzer extends Analyzer {
+    private final Version version;
+
+    /** @param ver Lucene version passed on to the LowerCaseFilter */
+    public LowerCaseKeywordAnalyzer(Version ver) {
+        this.version = ver;
+    }
+
+    /** Wraps a KeywordTokenizer (whole input as a single token) in a LowerCaseFilter. */
+    @Override
+    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        KeywordTokenizer source = new KeywordTokenizer(reader);
+        return new TokenStreamComponents(source, new LowerCaseFilter(version, source));
+    }
+}