cutting 2003/12/22 14:12:24
Modified: . CHANGES.txt
src/java/org/apache/lucene/analysis/standard
StandardTokenizer.java StandardTokenizer.jj
StandardTokenizerConstants.java
StandardTokenizerTokenManager.java
Log:
Fix StandardTokenizer's handling of CJK characters.
Revision Changes Path
1.64 +6 -1 jakarta-lucene/CHANGES.txt
Index: CHANGES.txt
===================================================================
RCS file: /home/cvs/jakarta-lucene/CHANGES.txt,v
retrieving revision 1.63
retrieving revision 1.64
diff -u -r1.63 -r1.64
--- CHANGES.txt 22 Dec 2003 21:42:48 -0000 1.63
+++ CHANGES.txt 22 Dec 2003 22:12:24 -0000 1.64
@@ -21,6 +21,11 @@
than the final token position. Position is used in phrase
searching (see PhraseQuery and Token.setPositionIncrement()).
+ 5. Fix StandardTokenizer's handling of CJK characters (Chinese,
+ Japanese and Korean ideograms). Previously contiguous sequences
+ were combined in a single token, which is not very useful. Now
+ each ideogram generates a separate token, which is more useful.
+
1.3 RC3
1.3 +6 -6 jakarta-lucene/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
Index: StandardTokenizer.java
===================================================================
RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -r1.2 -r1.3
--- StandardTokenizer.java 1 Oct 2003 16:39:26 -0000 1.2
+++ StandardTokenizer.java 22 Dec 2003 22:12:24 -0000 1.3
@@ -47,8 +47,8 @@
case NUM:
token = jj_consume_token(NUM);
break;
- case SIGRAM:
- token = jj_consume_token(SIGRAM);
+ case CJK:
+ token = jj_consume_token(CJK);
break;
case 0:
token = jj_consume_token(0);
@@ -79,7 +79,7 @@
jj_la1_0();
}
private static void jj_la1_0() {
- jj_la1_0 = new int[] {0x4ff,};
+ jj_la1_0 = new int[] {0x10ff,};
}
public StandardTokenizer(CharStream stream) {
@@ -158,8 +158,8 @@
public ParseException generateParseException() {
jj_expentries.removeAllElements();
- boolean[] la1tokens = new boolean[16];
- for (int i = 0; i < 16; i++) {
+ boolean[] la1tokens = new boolean[15];
+ for (int i = 0; i < 15; i++) {
la1tokens[i] = false;
}
if (jj_kind >= 0) {
@@ -175,7 +175,7 @@
}
}
}
- for (int i = 0; i < 16; i++) {
+ for (int i = 0; i < 15; i++) {
if (la1tokens[i]) {
jj_expentry = new int[1];
jj_expentry[0] = i;
1.6 +2 -3 jakarta-lucene/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.jj
Index: StandardTokenizer.jj
===================================================================
RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.jj,v
retrieving revision 1.5
retrieving revision 1.6
diff -u -r1.5 -r1.6
--- StandardTokenizer.jj 9 Oct 2003 12:24:11 -0000 1.5
+++ StandardTokenizer.jj 22 Dec 2003 22:12:24 -0000 1.6
@@ -125,7 +125,6 @@
(<LETTER>|<DIGIT>)*
>
-| < SIGRAM: (<CJK>)+ >
| < #ALPHA: (<LETTER>)+>
| < #LETTER: // unicode letters
[
@@ -137,7 +136,7 @@
"\u0100"-"\u1fff"
]
>
-| < #CJK: // non-alphabets
+| < CJK: // non-alphabets
[
"\u3040"-"\u318f",
"\u3300"-"\u337f",
@@ -187,7 +186,7 @@
token = <EMAIL> |
token = <HOST> |
token = <NUM> |
- token = <SIGRAM> |
+ token = <CJK> |
token = <EOF>
)
{
1.3 +5 -7 jakarta-lucene/src/java/org/apache/lucene/analysis/standard/StandardTokenizerConstants.java
Index: StandardTokenizerConstants.java
===================================================================
RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/analysis/standard/StandardTokenizerConstants.java,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -r1.2 -r1.3
--- StandardTokenizerConstants.java 1 Oct 2003 16:39:26 -0000 1.2
+++ StandardTokenizerConstants.java 22 Dec 2003 22:12:24 -0000 1.3
@@ -13,12 +13,11 @@
int NUM = 7;
int P = 8;
int HAS_DIGIT = 9;
- int SIGRAM = 10;
- int ALPHA = 11;
- int LETTER = 12;
- int CJK = 13;
- int DIGIT = 14;
- int NOISE = 15;
+ int ALPHA = 10;
+ int LETTER = 11;
+ int CJK = 12;
+ int DIGIT = 13;
+ int NOISE = 14;
int DEFAULT = 0;
@@ -33,7 +32,6 @@
"<NUM>",
"<P>",
"<HAS_DIGIT>",
- "<SIGRAM>",
"<ALPHA>",
"<LETTER>",
"<CJK>",
1.4 +18 -18 jakarta-lucene/src/java/org/apache/lucene/analysis/standard/StandardTokenizerTokenManager.java
Index: StandardTokenizerTokenManager.java
===================================================================
RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/analysis/standard/StandardTokenizerTokenManager.java,v
retrieving revision 1.3
retrieving revision 1.4
diff -u -r1.3 -r1.4
--- StandardTokenizerTokenManager.java 9 Oct 2003 12:24:11 -0000 1.3
+++ StandardTokenizerTokenManager.java 22 Dec 2003 22:12:24 -0000 1.4
@@ -8,7 +8,7 @@
public void setDebugStream(java.io.PrintStream ds) { debugStream = ds; }
private final int jjMoveStringLiteralDfa0_0()
{
- return jjMoveNfa_0(1, 0);
+ return jjMoveNfa_0(0, 0);
}
private final void jjCheckNAdd(int state)
{
@@ -101,7 +101,7 @@
{
switch(jjstateSet[--i])
{
- case 1:
+ case 0:
if ((0x3ff000000000000L & l) != 0L)
{
if (kind > 1)
@@ -111,6 +111,10 @@
if ((0x3ff000000000000L & l) != 0L)
jjCheckNAddStates(18, 23);
break;
+ case 1:
+ if ((0x3ff000000000000L & l) != 0L)
+ jjCheckNAddStates(18, 23);
+ break;
case 2:
case 39:
if ((0x3ff000000000000L & l) != 0L)
@@ -380,7 +384,7 @@
{
switch(jjstateSet[--i])
{
- case 1:
+ case 0:
if ((0x7fffffe07fffffeL & l) != 0L)
jjCheckNAddStates(30, 35);
if ((0x7fffffe07fffffeL & l) != 0L)
@@ -669,12 +673,11 @@
{
switch(jjstateSet[--i])
{
- case 1:
+ case 0:
if (jjCanMove_0(hiByte, i1, i2, l1, l2))
{
- if (kind > 10)
- kind = 10;
- jjCheckNAdd(0);
+ if (kind > 12)
+ kind = 12;
}
if (jjCanMove_1(hiByte, i1, i2, l1, l2))
jjCheckNAddStates(18, 23);
@@ -687,12 +690,9 @@
if (jjCanMove_2(hiByte, i1, i2, l1, l2))
jjCheckNAddStates(30, 35);
break;
- case 0:
- if (!jjCanMove_0(hiByte, i1, i2, l1, l2))
- break;
- if (kind > 10)
- kind = 10;
- jjCheckNAdd(0);
+ case 1:
+ if (jjCanMove_1(hiByte, i1, i2, l1, l2))
+ jjCheckNAddStates(18, 23);
break;
case 2:
if (jjCanMove_2(hiByte, i1, i2, l1, l2))
@@ -1021,15 +1021,15 @@
}
public static final String[] jjstrLiteralImages = {
"", null, null, null, null, null, null, null, null, null, null, null, null,
-null, null, null, };
+null, null, };
public static final String[] lexStateNames = {
"DEFAULT",
};
static final long[] jjtoToken = {
- 0x4ffL,
+ 0x10ffL,
};
static final long[] jjtoSkip = {
- 0x8000L,
+ 0x4000L,
};
protected CharStream input_stream;
private final int[] jjrounds = new int[73];
@@ -1115,9 +1115,9 @@
jjmatchedKind = 0x7fffffff;
jjmatchedPos = 0;
curPos = jjMoveStringLiteralDfa0_0();
- if (jjmatchedPos == 0 && jjmatchedKind > 15)
+ if (jjmatchedPos == 0 && jjmatchedKind > 14)
{
- jjmatchedKind = 15;
+ jjmatchedKind = 14;
}
if (jjmatchedKind != 0x7fffffff)
{
---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]