jena-text LowerCaseKeywordAnalyzer implementation

Osma Suominen Mon, 01 Sep 2014 05:39:13 -0700

Hi!

I liked the option to specify an Analyzer for jena-text, as implemented in JENA-654. But I'd like to use an analyzer that is otherwise like KeywordAnalyzer but case-insensitive, for use in an autocomplete/typeahead UI widget. Lucene doesn't include such an analyzer, but there are several implementations of the same idea, e.g. in neo4j [1] and stargate [2].

I created my own implementation of such an analyzer and added code to use it from the assembler. Patch attached.

This analyzer is now in a new package org.apache.jena.query.text.analyzer, in case other analyzers for jena-text will appear in the future. If you don't like the new package, the class can of course be moved to org.apache.jena.query.text.

I also added a test for case-insensitivity. To avoid lots of duplicate boilerplate code, I slightly modified and subclassed the existing test for KeywordAnalyzer.

I'd love to see this in the next version of jena-text and Fuseki. Of course I'll rework the patch if necessary. I can also tweak the web documentation to mention this analyzer.


-Osma

[1] https://github.com/apatry/neo4j-lucene4-index/blob/master/src/main/java/org/neo4j/index/impl/lucene/LowerCaseKeywordAnalyzer.java

[2] https://github.com/tuplejump/stargate-core/blob/master/src/main/java/com/tuplejump/stargate/lucene/CaseInsensitiveKeywordAnalyzer.java


--
Osma Suominen
D.Sc. (Tech), Information Systems Specialist
National Library of Finland
P.O. Box 26 (Teollisuuskatu 23)
00014 HELSINGIN YLIOPISTO
Tel. +358 50 3199529
[email protected]
http://www.nationallibrary.fi

Index: jena-text/src/test/java/org/apache/jena/query/text/TS_Text.java
===================================================================
--- jena-text/src/test/java/org/apache/jena/query/text/TS_Text.java	(revision 1621760)
+++ jena-text/src/test/java/org/apache/jena/query/text/TS_Text.java	(working copy)
@@ -44,6 +44,7 @@
     , TestDatasetWithSimpleAnalyzer.class
     , TestDatasetWithStandardAnalyzer.class
     , TestDatasetWithKeywordAnalyzer.class
+    , TestDatasetWithLowerCaseKeywordAnalyzer.class
 })
 
 public class TS_Text
Index: jena-text/src/test/java/org/apache/jena/query/text/TestDatasetWithKeywordAnalyzer.java
===================================================================
--- jena-text/src/test/java/org/apache/jena/query/text/TestDatasetWithKeywordAnalyzer.java	(revision 1621760)
+++ jena-text/src/test/java/org/apache/jena/query/text/TestDatasetWithKeywordAnalyzer.java	(working copy)
@@ -47,9 +47,9 @@
 	private static final String SPEC_BASE = "http://example.org/spec#";;
 	private static final String SPEC_ROOT_LOCAL = "lucene_text_dataset";
 	private static final String SPEC_ROOT_URI = SPEC_BASE + SPEC_ROOT_LOCAL;
-	private static final String SPEC;
-	static {
-	    SPEC = StrUtils.strjoinNL(
+
+	private static String makeSpec(String analyzer) {
+	    return StrUtils.strjoinNL(
 					"prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> ",
 					"prefix ja:   <http://jena.hpl.hp.com/2005/11/Assembler#> ",
 					"prefix tdb:  <http://jena.hpl.hp.com/2008/tdb#>",
@@ -87,15 +87,15 @@
 				    "    text:map (",
 				    "         [ text:field \"label\" ; ",
 				    "           text:predicate rdfs:label ;",
-				    "           text:analyzer [ a text:KeywordAnalyzer ]",
+				    "           text:analyzer [ a " + analyzer + " ]",
 				    "         ]",
 				    "         [ text:field \"comment\" ; text:predicate rdfs:comment ]",
 				    "         ) ."
 				    );
 	}      
 	
-	public static void init() {
-		Reader reader = new StringReader(SPEC);
+	public static void init(String analyzer) {
+		Reader reader = new StringReader(makeSpec(analyzer));
 		Model specModel = ModelFactory.createDefaultModel();
 		specModel.read(reader, "", "TURTLE");
 		TextAssembler.init();			
@@ -111,7 +111,7 @@
 	}	
 
 	@BeforeClass public static void beforeClass() {
-		init();
+		init("text:KeywordAnalyzer");
 	}	
 	
 	@AfterClass public static void afterClass() {
Index: jena-text/src/test/java/org/apache/jena/query/text/assembler/TestEntityMapAssembler.java
===================================================================
--- jena-text/src/test/java/org/apache/jena/query/text/assembler/TestEntityMapAssembler.java	(revision 1621760)
+++ jena-text/src/test/java/org/apache/jena/query/text/assembler/TestEntityMapAssembler.java	(working copy)
@@ -27,6 +27,7 @@
 import org.apache.jena.atlas.logging.LogCtl ;
 import org.apache.jena.query.text.EntityDefinition ;
 import org.apache.jena.query.text.TextIndexException ;
+import org.apache.jena.query.text.analyzer.LowerCaseKeywordAnalyzer ;
 import org.apache.lucene.analysis.core.KeywordAnalyzer ;
 import org.apache.lucene.analysis.core.SimpleAnalyzer ;
 import org.apache.lucene.analysis.standard.StandardAnalyzer ;
@@ -55,6 +56,7 @@
 	private static final Resource spec3;
 	private static final Resource spec4;
 	private static final Resource spec5;
+	private static final Resource spec6;
 	private static final Resource specNoEntityField;
 	private static final Resource specNoDefaultField;
 	private static final Resource specNoMapProperty;
@@ -112,6 +114,12 @@
     	assertEquals(KeywordAnalyzer.class, entityDef.getAnalyzer(SPEC1_DEFAULT_FIELD).getClass());
     }    
 	
+    @Test public void EntityHasMapEntryWithLowerCaseKeywordAnalyzer() {
+    	EntityDefinitionAssembler entDefAssem = new EntityDefinitionAssembler();
+    	EntityDefinition entityDef = entDefAssem.open(Assembler.general, spec6,  null);
+    	assertEquals(LowerCaseKeywordAnalyzer.class, entityDef.getAnalyzer(SPEC1_DEFAULT_FIELD).getClass());
+    }    
+	
 	@Test(expected=TextIndexException.class) public void errorOnNoEntityField() {
 		EntityDefinitionAssembler entDefAssem = new EntityDefinitionAssembler();
 		entDefAssem.open(null, specNoEntityField, null);
@@ -231,6 +239,22 @@
 						    		    				    		           .addProperty(RDF.type, TextVocab.keywordAnalyzer))
 						    		    		  }));
 				
+		// create a simple entity map specification using a lowercase keyword analyzer
+		
+				spec6 = model.createResource(TESTBASE + "spec6")
+						     .addProperty(TextVocab.pEntityField, SPEC1_ENTITY_FIELD)
+						     .addProperty(TextVocab.pDefaultField, SPEC1_DEFAULT_FIELD)
+						     .addProperty(TextVocab.pMap,
+						    		      model.createList(
+						    		    		  new RDFNode[] {
+						    		    				model.createResource()
+						    		    				     .addProperty(TextVocab.pField, SPEC1_DEFAULT_FIELD)
+						    		    				     .addProperty(TextVocab.pPredicate, SPEC1_PREDICATE)
+						    		    				     .addProperty(TextVocab.pAnalyzer, 
+						    		    				    		      model.createResource()
+						    		    				    		           .addProperty(RDF.type, TextVocab.lowerCaseKeywordAnalyzer))
+						    		    		  }));
+				
 		// bad assembler spec
 				
 		specNoEntityField = 
Index: jena-text/src/test/java/org/apache/jena/query/text/TestDatasetWithLowerCaseKeywordAnalyzer.java
===================================================================
--- jena-text/src/test/java/org/apache/jena/query/text/TestDatasetWithLowerCaseKeywordAnalyzer.java	(revision 0)
+++ jena-text/src/test/java/org/apache/jena/query/text/TestDatasetWithLowerCaseKeywordAnalyzer.java	(revision 0)
@@ -0,0 +1,58 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.jena.query.text;
+
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.jena.atlas.lib.StrUtils;
+
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+/**
+ * This class defines a setup configuration for a dataset that uses a lowercase keyword analyzer with a Lucene index.
+ */
+public class TestDatasetWithLowerCaseKeywordAnalyzer extends TestDatasetWithKeywordAnalyzer {
+	@BeforeClass public static void beforeClass() {
+		init("text:LowerCaseKeywordAnalyzer");
+	}	
+
+	@Test
+	public void testLowerCaseKeywordAnalyzerIsCaseInsensitive() {
+		final String testName = "testLowerCaseKeywordAnalyzerIsCaseInsensitive";
+		final String turtle = StrUtils.strjoinNL(
+				TURTLE_PROLOG,
+				"<" + RESOURCE_BASE + testName + ">",
+				"  rdfs:label 'F;riM at&/ped9'",
+				"."
+				);
+		String queryString = StrUtils.strjoinNL(
+				QUERY_PROLOG,
+				"SELECT ?s",
+				"WHERE {",
+				"    ?s text:query ( rdfs:label 'f;ri*' 10 ) .",
+				"}"
+				);
+		Set<String> expectedURIs = new HashSet<>() ;
+		expectedURIs.addAll( Arrays.asList(RESOURCE_BASE + testName)) ;
+		doTestSearch(turtle, queryString, expectedURIs);
+	}
+}
Index: jena-text/src/main/java/org/apache/jena/query/text/assembler/LowerCaseKeywordAnalyzerAssembler.java
===================================================================
--- jena-text/src/main/java/org/apache/jena/query/text/assembler/LowerCaseKeywordAnalyzerAssembler.java	(revision 0)
+++ jena-text/src/main/java/org/apache/jena/query/text/assembler/LowerCaseKeywordAnalyzerAssembler.java	(revision 0)
@@ -0,0 +1,48 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.jena.query.text.assembler ;
+
+import org.apache.jena.query.text.TextIndexLucene;
+import org.apache.jena.query.text.analyzer.LowerCaseKeywordAnalyzer;
+import org.apache.lucene.analysis.Analyzer;
+
+import com.hp.hpl.jena.assembler.Assembler;
+import com.hp.hpl.jena.assembler.Mode;
+import com.hp.hpl.jena.assembler.assemblers.AssemblerBase;
+import com.hp.hpl.jena.rdf.model.Resource;
+
+/**
+ * Assembler to create lowercase keyword analyzers.
+ */
+public class LowerCaseKeywordAnalyzerAssembler extends AssemblerBase {
+    /*
+    text:map (
+         [ text:field "text" ; 
+           text:predicate rdfs:label;
+           text:analyzer [
+               a  lucene:LowerCaseKeywordAnalyzer ;           ]
+         ]
+        .
+    */
+
+    @Override
+    public Analyzer open(Assembler a, Resource root, Mode mode) {
+    	return new LowerCaseKeywordAnalyzer(TextIndexLucene.VER);
+    }
+}
Index: jena-text/src/main/java/org/apache/jena/query/text/assembler/TextVocab.java
===================================================================
--- jena-text/src/main/java/org/apache/jena/query/text/assembler/TextVocab.java	(revision 1621760)
+++ jena-text/src/main/java/org/apache/jena/query/text/assembler/TextVocab.java	(working copy)
@@ -54,6 +54,7 @@
     public static final Property pStopWords         = Vocab.property(NS, "stopWords");
     public static final Resource simpleAnalyzer     = Vocab.resource(NS, "SimpleAnalyzer");
     public static final Resource keywordAnalyzer    = Vocab.resource(NS, "KeywordAnalyzer");
+    public static final Resource lowerCaseKeywordAnalyzer    = Vocab.resource(NS, "LowerCaseKeywordAnalyzer");
 
 }
 
Index: jena-text/src/main/java/org/apache/jena/query/text/assembler/TextAssembler.java
===================================================================
--- jena-text/src/main/java/org/apache/jena/query/text/assembler/TextAssembler.java	(revision 1621760)
+++ jena-text/src/main/java/org/apache/jena/query/text/assembler/TextAssembler.java	(working copy)
@@ -33,6 +33,7 @@
         Assembler.general.implementWith(TextVocab.standardAnalyzer, new StandardAnalyzerAssembler()) ;
         Assembler.general.implementWith(TextVocab.simpleAnalyzer,	new SimpleAnalyzerAssembler()) ;
         Assembler.general.implementWith(TextVocab.keywordAnalyzer,	new KeywordAnalyzerAssembler()) ;
+        Assembler.general.implementWith(TextVocab.lowerCaseKeywordAnalyzer,	new LowerCaseKeywordAnalyzerAssembler()) ;
     }
 }
 
Index: jena-text/src/main/java/org/apache/jena/query/text/analyzer/LowerCaseKeywordAnalyzer.java
===================================================================
--- jena-text/src/main/java/org/apache/jena/query/text/analyzer/LowerCaseKeywordAnalyzer.java	(revision 0)
+++ jena-text/src/main/java/org/apache/jena/query/text/analyzer/LowerCaseKeywordAnalyzer.java	(revision 0)
@@ -0,0 +1,49 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.jena.query.text.analyzer ;
+
+import java.io.Reader ;
+
+import org.apache.lucene.analysis.Analyzer ;
+import org.apache.lucene.analysis.core.KeywordTokenizer ;
+import org.apache.lucene.analysis.core.LowerCaseFilter ;
+import org.apache.lucene.util.Version ;
+
+
+/** 
+ * Lucene Analyzer implementation that works like KeywordAnalyzer (i.e.
+ * doesn't tokenize the input, keeps it as a single token), but forces text
+ * to lowercase and is thus case-insensitive.
+ */
+
+public class LowerCaseKeywordAnalyzer extends Analyzer {
+        private Version version;
+        
+        public LowerCaseKeywordAnalyzer(Version ver) {
+                this.version = ver;
+        }
+
+        @Override
+        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+                KeywordTokenizer source = new KeywordTokenizer(reader);
+                LowerCaseFilter filter = new LowerCaseFilter(version, source);
+                return new TokenStreamComponents(source, filter);
+        }
+
+}

jena-text LowerCaseKeywordAnalyzer implementation

Reply via email to