Modified: lucene/solr/trunk/src/test/org/apache/solr/analysis/TestTrimFilter.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/org/apache/solr/analysis/TestTrimFilter.java?rev=892821&r1=892820&r2=892821&view=diff
==============================================================================
--- lucene/solr/trunk/src/test/org/apache/solr/analysis/TestTrimFilter.java (original)
+++ lucene/solr/trunk/src/test/org/apache/solr/analysis/TestTrimFilter.java Mon Dec 21 13:53:50 2009
@@ -17,12 +17,19 @@
 package org.apache.solr.analysis;
 
+import java.io.IOException;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.Map;
+
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
-
-import java.util.List;
-
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 
 /**
  * @version $Id:$
@@ -35,46 +42,75 @@
     char[] ccc = "cCc".toCharArray();
     char[] whitespace = " ".toCharArray();
     char[] empty = "".toCharArray();
-    TokenStream ts = new TrimFilter
-      (new IterTokenStream(new Token(a, 0, a.length, 1, 5),
+    TrimFilterFactory factory = new TrimFilterFactory();
+    Map<String,String> args = new HashMap<String,String>();
+    args.put("updateOffsets", "false");
+    factory.init(args);
+    TokenStream ts = factory.create(new IterTokenStream(new Token(a, 0, a.length, 1, 5),
                     new Token(b, 0, b.length, 6, 10),
                     new Token(ccc, 0, ccc.length, 11, 15),
                     new Token(whitespace, 0, whitespace.length, 16, 20),
-                    new Token(empty, 0, empty.length, 21, 21)), false);
+                    new Token(empty, 0, empty.length, 21, 21)));
 
-    TermAttribute token;
-    assertTrue(ts.incrementToken());
-    token = (TermAttribute) ts.getAttribute(TermAttribute.class);
-    assertEquals("a", new String(token.termBuffer(), 0, token.termLength()));
-    assertTrue(ts.incrementToken());
-    assertEquals("b", new String(token.termBuffer(), 0, token.termLength()));
-    assertTrue(ts.incrementToken());
-    assertEquals("cCc", new String(token.termBuffer(), 0, token.termLength()));
-    assertTrue(ts.incrementToken());
-    assertEquals("", new String(token.termBuffer(), 0, token.termLength()));
-    assertTrue(ts.incrementToken());
-    assertEquals("", new String(token.termBuffer(), 0, token.termLength()));
-    assertFalse(ts.incrementToken());
+    assertTokenStreamContents(ts, new String[] { "a", "b", "cCc", "", ""});
 
     a = " a".toCharArray();
     b = "b ".toCharArray();
     ccc = " c ".toCharArray();
     whitespace = " ".toCharArray();
-    ts = new TrimFilter(new IterTokenStream(
+    factory = new TrimFilterFactory();
+    args = new HashMap<String,String>();
+    args.put("updateOffsets", "true");
+    factory.init(args);
+    ts = factory.create(new IterTokenStream(
         new Token(a, 0, a.length, 0, 2),
         new Token(b, 0, b.length, 0, 2),
         new Token(ccc, 0, ccc.length, 0, 3),
-        new Token(whitespace, 0, whitespace.length, 0, 3)), true);
+        new Token(whitespace, 0, whitespace.length, 0, 3)));
+
+    assertTokenStreamContents(ts,
+        new String[] { "a", "b", "c", "" },
+        new int[] { 1, 0, 1, 3 },
+        new int[] { 2, 1, 2, 3 },
+        new int[] { 1, 1, 1, 1 });
+  }
+
+  /**
+   * @deprecated does not support custom attributes
+   */
+  private static class IterTokenStream extends TokenStream {
+    final Token tokens[];
+    int index = 0;
+    TermAttribute termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+    OffsetAttribute offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
+    PositionIncrementAttribute posIncAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
+    FlagsAttribute flagsAtt = (FlagsAttribute) addAttribute(FlagsAttribute.class);
+    TypeAttribute typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
+    PayloadAttribute payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class);
 
-    List<Token> expect = tokens("a,1,1,2 b,1,0,1 c,1,1,2 ,1,3,3");
-    List<Token> real = getTokens(ts);
-    for (Token t : expect) {
-      System.out.println("TEST:" + t);
+    public IterTokenStream(Token... tokens) {
+      super();
+      this.tokens = tokens;
     }
-    for (Token t : real) {
-      System.out.println("REAL:" + t);
+
+    public IterTokenStream(Collection<Token> tokens) {
+      this(tokens.toArray(new Token[tokens.size()]));
+    }
+
+    public boolean incrementToken() throws IOException {
+      if (index >= tokens.length)
+        return false;
+      else {
+        clearAttributes();
+        Token token = tokens[index++];
+        termAtt.setTermBuffer(token.term());
+        offsetAtt.setOffset(token.startOffset(), token.endOffset());
+        posIncAtt.setPositionIncrement(token.getPositionIncrement());
+        flagsAtt.setFlags(token.getFlags());
+        typeAtt.setType(token.type());
+        payloadAtt.setPayload(token.getPayload());
+        return true;
+      }
    }
-    assertTokEqualOff(expect, real);
  }
-
 }
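A note on the helper the rewritten tests rely on: the assertTokenStreamContents(...) calls above replace the hand-rolled incrementToken()/assertEquals loops and come from the shared test base class (TestWordDelimiterFilter below now extends BaseTokenTestCase). The commit does not quote that helper, so the following is only a minimal sketch of what such a method plausibly does, reconstructed from the assertAnalyzesTo code this commit deletes further down; the class name and the null-means-skip convention are assumptions, not the actual BaseTokenTestCase source:

    import java.io.IOException;

    import junit.framework.Assert;

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
    import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    // Hypothetical stand-in for the BaseTokenTestCase helper (sketch only).
    class TokenStreamAssertions {
      static void assertTokenStreamContents(TokenStream ts, String[] terms,
          int[] startOffsets, int[] endOffsets, int[] posIncrements) throws IOException {
        TermAttribute termAtt = (TermAttribute) ts.getAttribute(TermAttribute.class);
        OffsetAttribute offsetAtt = (OffsetAttribute) ts.getAttribute(OffsetAttribute.class);
        PositionIncrementAttribute posIncAtt =
            (PositionIncrementAttribute) ts.getAttribute(PositionIncrementAttribute.class);
        for (int i = 0; i < terms.length; i++) {
          Assert.assertTrue("token " + i + " is missing", ts.incrementToken());
          Assert.assertEquals(terms[i], termAtt.term());
          // a null array means the caller does not check that property (assumed convention)
          if (startOffsets != null) Assert.assertEquals(startOffsets[i], offsetAtt.startOffset());
          if (endOffsets != null) Assert.assertEquals(endOffsets[i], offsetAtt.endOffset());
          if (posIncrements != null) Assert.assertEquals(posIncrements[i], posIncAtt.getPositionIncrement());
        }
        Assert.assertFalse("stream must be exhausted", ts.incrementToken());
        ts.close();
      }

      // Terms-only overload, matching calls like assertTokenStreamContents(ts, output).
      static void assertTokenStreamContents(TokenStream ts, String[] terms) throws IOException {
        assertTokenStreamContents(ts, terms, null, null, null);
      }
    }

Whatever its exact shape, centralizing the loop keeps each test to one declarative call and ensures the final end-of-stream assertFalse is never forgotten.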
Modified: lucene/solr/trunk/src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java?rev=892821&r1=892820&r2=892821&view=diff
==============================================================================
--- lucene/solr/trunk/src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java (original)
+++ lucene/solr/trunk/src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java Mon Dec 21 13:53:50 2009
@@ -17,14 +17,14 @@
 package org.apache.solr.analysis;
 
-import org.apache.solr.util.AbstractSolrTestCase;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.KeywordTokenizer;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.WhitespaceTokenizer;
-import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.miscellaneous.SingleTokenTokenStream;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 
@@ -37,7 +37,7 @@
 /**
  * New WordDelimiterFilter tests... most of the tests are in ConvertedLegacyTest
 */
-public class TestWordDelimiterFilter extends AbstractSolrTestCase {
+public class TestWordDelimiterFilter extends BaseTokenTestCase {
  public String getSchemaFile() { return "solr/conf/schema.xml"; }
  public String getSolrConfigFile() { return "solr/conf/solrconfig.xml"; }
 
@@ -144,148 +144,74 @@
     // test that subwords and catenated subwords have
     // the correct offsets.
     WordDelimiterFilter wdf = new WordDelimiterFilter(
-        new TokenStream() {
-          Token t;
-          public Token next() throws IOException {
-            if (t!=null) return null;
-            t = new Token("foo-bar", 5, 12); // actual
-            return t;
-          }
-        },
+        new SingleTokenTokenStream(new Token("foo-bar", 5, 12)),
         1,1,0,0,1,1,0);
 
-    int i=0;
-    for(Token t; (t=wdf.next())!=null;) {
-      String termText = new String(t.termBuffer(), 0, t.termLength());
-      if (termText.equals("foo")) {
-        assertEquals(5, t.startOffset());
-        assertEquals(8, t.endOffset());
-        i++;
-      }
-      if (termText.equals("bar")) {
-        assertEquals(9, t.startOffset());
-        assertEquals(12, t.endOffset());
-        i++;
-      }
-      if (termText.equals("foobar")) {
-        assertEquals(5, t.startOffset());
-        assertEquals(12, t.endOffset());
-        i++;
-      }
-    }
-    assertEquals(3,i); // make sure all 3 tokens were generated
+    assertTokenStreamContents(wdf,
+        new String[] { "foo", "bar", "foobar" },
+        new int[] { 5, 9, 5 },
+        new int[] { 8, 12, 12 });
 
-    // test that if splitting or catenating a synonym, that the offsets
-    // are not altered (they would be incorrect).
     wdf = new WordDelimiterFilter(
-        new TokenStream() {
-          Token t;
-          public Token next() throws IOException {
-            if (t!=null) return null;
-            t = new Token("foo-bar", 5, 6); // a synonym
-            return t;
-          }
-        },
+        new SingleTokenTokenStream(new Token("foo-bar", 5, 6)),
         1,1,0,0,1,1,0);
 
-    for(Token t; (t=wdf.next())!=null;) {
-      assertEquals(5, t.startOffset());
-      assertEquals(6, t.endOffset());
-    }
+
+    assertTokenStreamContents(wdf,
+        new String[] { "foo", "bar", "foobar" },
+        new int[] { 5, 5, 5 },
+        new int[] { 6, 6, 6 });
   }
 
   public void testOffsetChange() throws Exception {
     WordDelimiterFilter wdf = new WordDelimiterFilter(
-        new TokenStream() {
-          Token t;
-          public Token next() {
-            if (t != null) return null;
-            t = new Token("übelkeit)", 7, 16);
-            return t;
-          }
-        },
+        new SingleTokenTokenStream(new Token("übelkeit)", 7, 16)),
        1,1,0,0,1,1,0
    );
 
-    Token t = wdf.next();
-
-    assertNotNull(t);
-    assertEquals("übelkeit", t.term());
-    assertEquals(7, t.startOffset());
-    assertEquals(15, t.endOffset());
+    assertTokenStreamContents(wdf,
+        new String[] { "übelkeit" },
+        new int[] { 7 },
+        new int[] { 15 });
  }
 
  public void testOffsetChange2() throws Exception {
    WordDelimiterFilter wdf = new WordDelimiterFilter(
-        new TokenStream() {
-          Token t;
-          public Token next() {
-            if (t != null) return null;
-            t = new Token("(übelkeit", 7, 17);
-            return t;
-          }
-        },
+        new SingleTokenTokenStream(new Token("(übelkeit", 7, 17)),
        1,1,0,0,1,1,0
    );
 
-    Token t = wdf.next();
-
-    assertNotNull(t);
-    assertEquals("übelkeit", t.term());
-    assertEquals(8, t.startOffset());
-    assertEquals(17, t.endOffset());
+    assertTokenStreamContents(wdf,
+        new String[] { "übelkeit" },
+        new int[] { 8 },
+        new int[] { 17 });
  }
 
  public void testOffsetChange3() throws Exception {
    WordDelimiterFilter wdf = new WordDelimiterFilter(
-        new TokenStream() {
-          Token t;
-          public Token next() {
-            if (t != null) return null;
-            t = new Token("(übelkeit", 7, 16);
-            return t;
-          }
-        },
+        new SingleTokenTokenStream(new Token("(übelkeit", 7, 16)),
        1,1,0,0,1,1,0
    );
 
-    Token t = wdf.next();
-
-    assertNotNull(t);
-    assertEquals("übelkeit", t.term());
-    assertEquals(8, t.startOffset());
-    assertEquals(16, t.endOffset());
+    assertTokenStreamContents(wdf,
+        new String[] { "übelkeit" },
+        new int[] { 8 },
+        new int[] { 16 });
  }
 
  public void testOffsetChange4() throws Exception {
    WordDelimiterFilter wdf = new WordDelimiterFilter(
-        new TokenStream() {
-          private Token t;
-          public Token next() {
-            if (t != null) return null;
-            t = new Token("(foo,bar)", 7, 16);
-            return t;
-          }
-        },
+        new SingleTokenTokenStream(new Token("(foo,bar)", 7, 16)),
        1,1,0,0,1,1,0
    );
 
-    Token t = wdf.next();
-
-    assertNotNull(t);
-    assertEquals("foo", t.term());
-    assertEquals(8, t.startOffset());
-    assertEquals(11, t.endOffset());
-
-    t = wdf.next();
-
-    assertNotNull(t);
-    assertEquals("bar", t.term());
-    assertEquals(12, t.startOffset());
-    assertEquals(15, t.endOffset());
+    assertTokenStreamContents(wdf,
+        new String[] { "foo", "bar", "foobar"},
+        new int[] { 8, 12, 8 },
+        new int[] { 11, 15, 15 });
  }
 
  public void testAlphaNumericWords(){
@@ -338,24 +264,10 @@
 
  public void doSplit(final String input, String... output) throws Exception {
-    WordDelimiterFilter wdf = new WordDelimiterFilter(new TokenStream() {
-      boolean done=false;
-      @Override
-      public Token next() throws IOException {
-        if (done) return null;
-        done = true;
-        return new Token(input,0,input.length());
-      }
-    }
-    ,1,1,0,0,0
-    );
-
-    for(String expected : output) {
-      Token t = wdf.next();
-      assertEquals(expected, t.term());
-    }
-
-    assertEquals(null, wdf.next());
+    WordDelimiterFilter wdf = new WordDelimiterFilter(new KeywordTokenizer(
+        new StringReader(input)), 1, 1, 0, 0, 0);
+
+    assertTokenStreamContents(wdf, output);
  }
 
  public void testSplits() throws Exception {
@@ -365,29 +277,38 @@
    // non-space marking symbol shouldn't cause split
    // this is an example in Thai
    doSplit("\u0e1a\u0e49\u0e32\u0e19","\u0e1a\u0e49\u0e32\u0e19");
+    // possessive followed by delimiter
+    doSplit("test's'", "test");
+    // some russian upper and lowercase
+    doSplit("Роберт", "Роберт");
+    // now cause a split (russian camelCase)
+    doSplit("РобЕрт", "Роб", "Ерт");
+    // a composed titlecase character, don't split
+    doSplit("aǅungla", "aǅungla");
+
+    // a modifier letter, don't split
+    doSplit("ســـــــــــــــــــلام", "ســـــــــــــــــــلام");
+
+    // enclosing mark, don't split
+    doSplit("۞test", "۞test");
+
+    // combining spacing mark (the virama), don't split
+    doSplit("हिन्दी", "हिन्दी");
+
+    // don't split non-ascii digits
+    doSplit("١٢٣٤", "١٢٣٤");
+
+    // don't split supplementaries into unpaired surrogates
+    doSplit("𠀀𠀀", "𠀀𠀀");
  }
 
  public void doSplitPossessive(int stemPossessive, final String input, final String... output) throws Exception {
-    WordDelimiterFilter wdf = new WordDelimiterFilter(new TokenStream() {
-      boolean done=false;
-      @Override
-      public Token next() throws IOException {
-        if (done) return null;
-        done = true;
-        return new Token(input,0,input.length());
-      }
-    }
-    ,1,1,0,0,0,1,0,1,stemPossessive,null
-    );
-
-    for(String expected : output) {
-      Token t = wdf.next();
-      assertEquals(expected, t.term());
-    }
+    WordDelimiterFilter wdf = new WordDelimiterFilter(new KeywordTokenizer(
+        new StringReader(input)), 1,1,0,0,0,1,0,1,stemPossessive, null);
 
-    assertEquals(null, wdf.next());
+    assertTokenStreamContents(wdf, output);
  }
 
  /*
@@ -485,25 +406,4 @@
        new int[] { 6, 14, 19 },
        new int[] { 1, 11, 1 });
  }
-
-  private void assertAnalyzesTo(Analyzer a, String input, String[] output,
-      int startOffsets[], int endOffsets[], int posIncs[]) throws Exception {
-
-    TokenStream ts = a.tokenStream("dummy", new StringReader(input));
-    TermAttribute termAtt = (TermAttribute) ts
-        .getAttribute(TermAttribute.class);
-    OffsetAttribute offsetAtt = (OffsetAttribute) ts
-        .getAttribute(OffsetAttribute.class);
-    PositionIncrementAttribute posIncAtt = (PositionIncrementAttribute) ts
-        .getAttribute(PositionIncrementAttribute.class);
-    for (int i = 0; i < output.length; i++) {
-      assertTrue(ts.incrementToken());
-      assertEquals(output[i], termAtt.term());
-      assertEquals(startOffsets[i], offsetAtt.startOffset());
-      assertEquals(endOffsets[i], offsetAtt.endOffset());
-      assertEquals(posIncs[i], posIncAtt.getPositionIncrement());
-    }
-    assertFalse(ts.incrementToken());
-    ts.close();
-  }
 }

Added: lucene/solr/trunk/src/test/test-files/solr/conf/compoundDictionary.txt
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/test-files/solr/conf/compoundDictionary.txt?rev=892821&view=auto
==============================================================================
--- lucene/solr/trunk/src/test/test-files/solr/conf/compoundDictionary.txt (added)
+++ lucene/solr/trunk/src/test/test-files/solr/conf/compoundDictionary.txt Mon Dec 21 13:53:50 2009
@@ -0,0 +1,19 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# A set of words for testing the DictionaryCompound factory
+soft
+ball
+team

Added: lucene/solr/trunk/src/test/test-files/solr/conf/frenchArticles.txt
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/test-files/solr/conf/frenchArticles.txt?rev=892821&view=auto
==============================================================================
--- lucene/solr/trunk/src/test/test-files/solr/conf/frenchArticles.txt (added)
+++ lucene/solr/trunk/src/test/test-files/solr/conf/frenchArticles.txt Mon Dec 21 13:53:50 2009
@@ -0,0 +1,24 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# A set of articles for testing the French Elision filter.
+# Requiring a text file is a bit weird here...
+l
+m
+t
+qu
+n
+s
+j
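The tests that consume these two resource files are not part of the hunks quoted above. Purely as an illustration, here is a hypothetical sketch of how a ResourceLoaderAware factory picks up such a file, mirroring the init(Map)/create() pattern of the TrimFilterFactory change at the top of this commit; the SolrResourceLoader construction and the exact ElisionFilterFactory wiring are assumptions, not code from this commit:

    import java.io.StringReader;
    import java.util.HashMap;
    import java.util.Map;

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.WhitespaceTokenizer;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;
    import org.apache.solr.analysis.ElisionFilterFactory;
    import org.apache.solr.common.ResourceLoader;
    import org.apache.solr.core.SolrResourceLoader;

    public class ElisionUsageSketch {
      public static void main(String[] argv) throws Exception {
        // Assumption: a loader rooted at the default test solr/ home resolves
        // frenchArticles.txt from the conf/ directory by its bare name.
        ResourceLoader loader = new SolrResourceLoader(null, null);

        Map<String,String> args = new HashMap<String,String>();
        args.put("articles", "frenchArticles.txt");

        ElisionFilterFactory factory = new ElisionFilterFactory();
        factory.init(args);
        factory.inform(loader); // reads the article list via the loader

        // "l'avion" -> "avion": the article before the apostrophe is elided.
        TokenStream ts = factory.create(
            new WhitespaceTokenizer(new StringReader("l'avion")));
        TermAttribute term = (TermAttribute) ts.getAttribute(TermAttribute.class);
        while (ts.incrementToken()) {
          System.out.println(term.term()); // expected: avion
        }
      }
    }

compoundDictionary.txt would presumably be wired up the same way, through the "dictionary" parameter of DictionaryCompoundWordTokenFilterFactory.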
