[ https://issues.apache.org/jira/browse/LUCENE-1003?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
TUSUR OpenTeam updated LUCENE-1003: ----------------------------------- Description: RussianAnalyzer's tokenizer skips numbers in the input text, so the resulting token stream is missing them. The problem can be solved by adding the digits to RussianCharsets.UnicodeRussian. See the test case below for details. {code:title=TestRussianAnalyzer.java|borderStyle=solid} public class TestRussianAnalyzer extends TestCase { Reader reader = new StringReader("text 1000"); // test FAILS public void testStemmer() { testAnalyzer(new RussianAnalyzer()); } // test PASSES public void testFixedRussianAnalyzer() { testAnalyzer(new RussianAnalyzer(getRussianCharSet())); } private void testAnalyzer(RussianAnalyzer analyzer) { try { TokenStream stream = analyzer.tokenStream("text", reader); assertEquals("text", stream.next().termText()); assertNotNull(stream.next()); } catch (IOException e) { fail(e.getMessage()); } } private char[] getRussianCharSet() { int length = RussianCharsets.UnicodeRussian.length; final char[] russianChars = new char[length + 10]; System .arraycopy(RussianCharsets.UnicodeRussian, 0, russianChars, 0, length); russianChars[length++] = '0'; russianChars[length++] = '1'; russianChars[length++] = '2'; russianChars[length++] = '3'; russianChars[length++] = '4'; russianChars[length++] = '5'; russianChars[length++] = '6'; russianChars[length++] = '7'; russianChars[length++] = '8'; russianChars[length] = '9'; return russianChars; } } {code} was: RussianAnalyzer's tokenizer skips numbers in the input text, so the resulting token stream is missing them. The problem can be solved by adding the digits to RussianCharsets.UnicodeRussian. See the test case below for details. 
{code:title=TestRussianAnalyzer.java|borderStyle=solid} public class TestRussianAnalyzer extends TestCase { Reader reader = new StringReader("text 1000"); public void testStemmer() { testAnalyzer(new RussianAnalyzer()); } public void testFixedRussianAnalyzer() { testAnalyzer(new RussianAnalyzer(getRussianCharSet())); } private void testAnalyzer(RussianAnalyzer analyzer) { try { TokenStream stream = analyzer.tokenStream("text", reader); assertEquals("text", stream.next().termText()); assertNotNull(stream.next()); } catch (IOException e) { fail(e.getMessage()); } } private char[] getRussianCharSet() { int length = RussianCharsets.UnicodeRussian.length; final char[] russianChars = new char[length + 10]; System .arraycopy(RussianCharsets.UnicodeRussian, 0, russianChars, 0, length); russianChars[length++] = '0'; russianChars[length++] = '1'; russianChars[length++] = '2'; russianChars[length++] = '3'; russianChars[length++] = '4'; russianChars[length++] = '5'; russianChars[length++] = '6'; russianChars[length++] = '7'; russianChars[length++] = '8'; russianChars[length] = '9'; return russianChars; } } {code} > RussianAnalyzer's tokenizer skips numbers from input text, > ---------------------------------------------------------- > > Key: LUCENE-1003 > URL: https://issues.apache.org/jira/browse/LUCENE-1003 > Project: Lucene - Java > Issue Type: Bug > Components: Analysis > Affects Versions: 2.2 > Reporter: TUSUR OpenTeam > > RussianAnalyzer's tokenizer skips numbers in the input text, so the resulting > token stream is missing them. The problem can be solved by adding the digits to > RussianCharsets.UnicodeRussian. See the test case below for details. 
> {code:title=TestRussianAnalyzer.java|borderStyle=solid} > public class TestRussianAnalyzer extends TestCase { > Reader reader = new StringReader("text 1000"); > // test FAILS > public void testStemmer() { > testAnalyzer(new RussianAnalyzer()); > } > // test PASSES > public void testFixedRussianAnalyzer() { > testAnalyzer(new RussianAnalyzer(getRussianCharSet())); > } > private void testAnalyzer(RussianAnalyzer analyzer) { > try { > TokenStream stream = analyzer.tokenStream("text", reader); > assertEquals("text", stream.next().termText()); > assertNotNull(stream.next()); > } catch (IOException e) { > fail(e.getMessage()); > } > } > private char[] getRussianCharSet() { > int length = RussianCharsets.UnicodeRussian.length; > final char[] russianChars = new char[length + 10]; > System > .arraycopy(RussianCharsets.UnicodeRussian, 0, russianChars, 0, > length); > russianChars[length++] = '0'; > russianChars[length++] = '1'; > russianChars[length++] = '2'; > russianChars[length++] = '3'; > russianChars[length++] = '4'; > russianChars[length++] = '5'; > russianChars[length++] = '6'; > russianChars[length++] = '7'; > russianChars[length++] = '8'; > russianChars[length] = '9'; > return russianChars; > } > } > {code} -- This message is automatically generated by JIRA. - You can reply to this email to add a comment to the issue online. --------------------------------------------------------------------- To unsubscribe, e-mail: [EMAIL PROTECTED] For additional commands, e-mail: [EMAIL PROTECTED]