Update of /cvsroot/monetdb/pathfinder/modules/pftijah
In directory 23jxhf1.ch3.sourceforge.com:/tmp/cvs-serv31507

Modified Files:
      Tag: Feb2009
        pftijah_tokenize.l 
Log Message:
the previous fix was slightly incorrect.
(flex is not able to match two rules at the same time)



U pftijah_tokenize.l
Index: pftijah_tokenize.l
===================================================================
RCS file: /cvsroot/monetdb/pathfinder/modules/pftijah/pftijah_tokenize.l,v
retrieving revision 1.18.4.2
retrieving revision 1.18.4.3
diff -u -d -r1.18.4.2 -r1.18.4.3
--- pftijah_tokenize.l  25 Mar 2009 11:15:41 -0000      1.18.4.2
+++ pftijah_tokenize.l  25 Mar 2009 13:00:47 -0000      1.18.4.3
@@ -107,78 +107,67 @@
 /* XML Character Classes (http://www.w3.org/TR/REC-xml/#CharClasses) */
 /* 8 bit characters from the Letter class
    see http://www.w3.org/TR/REC-xml/#NT-Letter */
-Capital                             [\x41-\x5a\xc0-\xd6\xd8-\xde]
-
-/* restrictive Words
-   Word                                
({Capital}{Small_Letter}+)|({Small_Letter}{Small_Letter}+) */
+Capital                          [\x41-\x5a\xc0-\xd6\xd8-\xde]
 
 
 /* most relaxed condition for words, parsing almost everything expect 
whitespace */
-Word                                {AlphaNumChar}{2,}
+Word                             {AlphaNumChar}{2,}
 
-%%
+AsciiAlphaNumChar               {OneByteBaseChar}|{Digit}
 
-{TwoByteBaseChar}                  { use_multi_chars = 1; }
+AsciiWord                       {AsciiAlphaNumChar}{2,}
 
-{ThreeByteBaseChar}                { use_multi_chars = 1; }
+%%
+
+{AsciiWord}                        { /* case simple WORD */ 
+                                      for (e = pftijah_tokenizetext; *e != 
'\0'; e++) 
+                                          *e = tolower(*e); 
+                                     e = pftijah_tokenizetext;
+                                     return 1;
+                                   }
 
 {S}                                { /* zap white space */ }
 
 {Word}                              { /* case WORD */
-                                     if ( use_multi_chars ) {
-                                       strLower(&e, pftijah_tokenizetext);
-                                     } else {
-                                        for (e = pftijah_tokenizetext; *e != 
'\0'; e++) 
-                                          *e = tolower(*e); 
-                                       e = pftijah_tokenizetext;
-                                     }
-                                     // stream_printf(GDKout,"EMIT[%s]\n",e);
+                                      strLower(&e, pftijah_tokenizetext);
                                      return 1;
                                    }
 
-{AlphaNumChar}{1,2}[-]{Word}        { /* case e-mail, e-government, on-line 
+{AsciiAlphaNumChar}{1,2}[-]{AsciiWord} { /* case e-mail, e-government, on-line 
                                          problem: make-up */ 
                                         /* strip hyphen */
-                                     if ( use_multi_chars ) {
-                                        bit repeat = 0;
-                                        strSubstitute(&c, 
pftijah_tokenizetext, "-", "", &repeat); 
-                                       strLower(&e, c);
-
-                                     } else {
-                                        e = pftijah_tokenizetext;
-                                        for (c = pftijah_tokenizetext; *c != 
'\0'; c++) 
+                                      e = pftijah_tokenizetext;
+                                      for (c = pftijah_tokenizetext; *c != 
'\0'; c++) 
                                           if (*c != '-') 
                                             *(e++) = tolower(*c);
-                                        *e = '\0';
-                                       e = pftijah_tokenizetext;
-                                     }
+                                      *e = '\0';
+                                     e = pftijah_tokenizetext;
                                       return 1;
                                    }
 
-{Word}[\']{AlphaNumChar}{1,2}       { /* case CONTRACTION */ 
-                                     if ( use_multi_chars ) 
-                                       strLower(&e, pftijah_tokenizetext);
-                                     else {
-                                        for (e = pftijah_tokenizetext; *e != 
'\0'; e++) 
-                                          *e = tolower(*e); 
-                                       e = pftijah_tokenizetext;
-                                     }
+{AsciiWord}[\']{AsciiAlphaNumChar}{1,2} { /* case CONTRACTION */ 
+                                        /* strip "'" */
+                                      e = pftijah_tokenizetext;
+                                      for (c = pftijah_tokenizetext; *c != 
'\0'; c++) 
+                                          if (*c != '\'') 
+                                            *(e++) = tolower(*c);
+                                      *e = '\0';
+                                     e = pftijah_tokenizetext;
+                                     return 1;
+                                   }
+{Word}[\']{AlphaNumChar}{1,2}      { /* case CONTRACTION */ 
+                                      bit repeat = 1;
+                                      strSubstitute(&c, pftijah_tokenizetext, 
"'", "", &repeat); 
+                                     strLower(&e, c);
+                                     GDKfree(c);
                                      return 1;
                                    }
 
 {Capital}[\.]({Capital}[\.])+       { /* case ACRONYM */ 
-                                     if ( use_multi_chars )  {
-                                        bit repeat = 1;
-                                        strSubstitute(&c, 
pftijah_tokenizetext, ".", "", &repeat); 
-                                       strLower(&e, c);
-                                     } else {
-                                        e = pftijah_tokenizetext;
-                                        for (c = pftijah_tokenizetext; *c != 
'\0'; c++) 
-                                          if (*c != '.') 
-                                            *(e++) = tolower(*c);
-                                        *e = '\0';
-                                        e = pftijah_tokenizetext;
-                                     }
+                                      bit repeat = 1;
+                                      strSubstitute(&c, pftijah_tokenizetext, 
".", "", &repeat); 
+                                     strLower(&e, c);
+                                     GDKfree(c);
                                       return 1;
                                     }
 


------------------------------------------------------------------------------
Apps built with the Adobe(R) Flex(R) framework and Flex Builder(TM) are
powering Web 2.0 with engaging, cross-platform capabilities. Quickly and
easily build your RIAs with Flex Builder, the Eclipse(TM)-based development
software that enables intelligent coding and step-through debugging.
Download the free 60 day trial. http://p.sf.net/sfu/www-adobe-com
_______________________________________________
Monetdb-pf-checkins mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/monetdb-pf-checkins

Reply via email to