RussianAnalyzer's tokenizer skips numbers from input text ----------------------------------------------------------
Key: LUCENE-1003 URL: https://issues.apache.org/jira/browse/LUCENE-1003 Project: Lucene - Java Issue Type: Bug Components: Analysis Affects Versions: 2.2 Reporter: TUSUR OpenTeam RussianAnalyzer's tokenizer skips numbers in the input text, so the resulting token stream is missing them. The problem can be solved by adding the digit characters to RussianCharsets.UnicodeRussian. See the test case below for details. {code:title=TestRussianAnalyzer.java|borderStyle=solid} public class TestRussianAnalyzer extends TestCase { Reader reader = new StringReader("text 1000"); public void testStemmer() { testAnalyzer(new RussianAnalyzer()); } public void testFixedRussianAnalyzer() { testAnalyzer(new RussianAnalyzer(getRussianCharSet())); } private void testAnalyzer(RussianAnalyzer analyzer) { try { TokenStream stream = analyzer.tokenStream("text", reader); assertEquals("text", stream.next().termText()); assertNotNull(stream.next()); } catch (IOException e) { fail(e.getMessage()); } } private char[] getRussianCharSet() { int length = RussianCharsets.UnicodeRussian.length; final char[] russianChars = new char[length + 10]; System .arraycopy(RussianCharsets.UnicodeRussian, 0, russianChars, 0, length); russianChars[length++] = '0'; russianChars[length++] = '1'; russianChars[length++] = '2'; russianChars[length++] = '3'; russianChars[length++] = '4'; russianChars[length++] = '5'; russianChars[length++] = '6'; russianChars[length++] = '7'; russianChars[length++] = '8'; russianChars[length] = '9'; return russianChars; } } {code} -- This message is automatically generated by JIRA. - You can reply to this email to add a comment to the issue online. --------------------------------------------------------------------- To unsubscribe, e-mail: [EMAIL PROTECTED] For additional commands, e-mail: [EMAIL PROTECTED]