ehatcher 2003/09/30 09:31:49 Modified: src/java/org/apache/lucene/analysis/standard StandardTokenizer.jj Log: #23466 - StandardTokenizer with CJK support(sigram) Revision Changes Path 1.4 +8 -2 jakarta-lucene/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.jj Index: StandardTokenizer.jj =================================================================== RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.jj,v retrieving revision 1.3 retrieving revision 1.4 diff -u -r1.3 -r1.4 --- StandardTokenizer.jj 5 Jun 2002 04:54:47 -0000 1.3 +++ StandardTokenizer.jj 30 Sep 2003 16:31:49 -0000 1.4 @@ -56,7 +56,7 @@ STATIC = false; //IGNORE_CASE = true; //BUILD_PARSER = false; -//UNICODE_INPUT = true; + UNICODE_INPUT = true; USER_CHAR_STREAM = true; OPTIMIZE_TOKEN_MANAGER = true; //DEBUG_TOKEN_MANAGER = true; @@ -125,6 +125,7 @@ (<LETTER>|<DIGIT>)* > +| < SIGRAM: (<CJK>)+ > | < #ALPHA: (<LETTER>)+> | < #LETTER: // unicode letters [ @@ -133,7 +134,11 @@ "\u00c0"-"\u00d6", "\u00d8"-"\u00f6", "\u00f8"-"\u00ff", - "\u0100"-"\u1fff", + "\u0100"-"\u1fff" + ] + > +| < #CJK: // non-alphabets + [ "\u3040"-"\u318f", "\u3300"-"\u337f", "\u3400"-"\u3d2d", @@ -182,6 +187,7 @@ token = <EMAIL> | token = <HOST> | token = <NUM> | + token = <SIGRAM> | token = <EOF> ) {
--------------------------------------------------------------------- To unsubscribe, e-mail: [EMAIL PROTECTED] For additional commands, e-mail: [EMAIL PROTECTED]