Update of /cvsroot/monetdb/pathfinder/modules/pftijah
In directory 23jxhf1.ch3.sourceforge.com:/tmp/cvs-serv28359

Modified Files:
      Tag: Feb2009
        pftijah_tokenize.l 
Log Message:
- optimize tokenization of non-UTF-8 text and free strLower() results



U pftijah_tokenize.l
Index: pftijah_tokenize.l
===================================================================
RCS file: /cvsroot/monetdb/pathfinder/modules/pftijah/pftijah_tokenize.l,v
retrieving revision 1.18.4.1
retrieving revision 1.18.4.2
diff -u -d -r1.18.4.1 -r1.18.4.2
--- pftijah_tokenize.l  24 Mar 2009 15:13:24 -0000      1.18.4.1
+++ pftijah_tokenize.l  25 Mar 2009 11:15:41 -0000      1.18.4.2
@@ -42,6 +42,7 @@
 #endif
 
 static char *c, *e;
+static int  use_multi_chars;
 
 /* This fix is to prevent problems with flex 2.5.33 and lower on Debian and
  * Gentoo systems. When flex.2.5.4 and higher is obligatory this define
@@ -117,31 +118,67 @@
 
 %%
 
+{TwoByteBaseChar}                  { use_multi_chars = 1; }
+
+{ThreeByteBaseChar}                { use_multi_chars = 1; }
+
 {S}                                { /* zap white space */ }
 
 {Word}                              { /* case WORD */
-                                      strLower(&e, pftijah_tokenizetext);
+                                     if ( use_multi_chars ) {
+                                       strLower(&e, pftijah_tokenizetext);
+                                     } else {
+                                        for (e = pftijah_tokenizetext; *e != 
'\0'; e++) 
+                                          *e = tolower(*e); 
+                                       e = pftijah_tokenizetext;
+                                     }
+                                     // stream_printf(GDKout,"EMIT[%s]\n",e);
                                      return 1;
                                    }
 
 {AlphaNumChar}{1,2}[-]{Word}        { /* case e-mail, e-government, on-line 
                                          problem: make-up */ 
-                                      /* strip hyphen */
-                                      bit repeat = 0;
-                                      strSubstitute(&c, pftijah_tokenizetext, 
"-", "", &repeat); 
-                                     strLower(&e, c);
+                                        /* strip hyphen */
+                                     if ( use_multi_chars ) {
+                                        bit repeat = 0;
+                                        strSubstitute(&c, 
pftijah_tokenizetext, "-", "", &repeat); 
+                                       strLower(&e, c);
+
+                                     } else {
+                                        e = pftijah_tokenizetext;
+                                        for (c = pftijah_tokenizetext; *c != 
'\0'; c++) 
+                                          if (*c != '-') 
+                                            *(e++) = tolower(*c);
+                                        *e = '\0';
+                                       e = pftijah_tokenizetext;
+                                     }
                                       return 1;
                                    }
 
 {Word}[\']{AlphaNumChar}{1,2}       { /* case CONTRACTION */ 
-                                      strLower(&e, pftijah_tokenizetext);
-                                      return 1;
+                                     if ( use_multi_chars ) 
+                                       strLower(&e, pftijah_tokenizetext);
+                                     else {
+                                        for (e = pftijah_tokenizetext; *e != 
'\0'; e++) 
+                                          *e = tolower(*e); 
+                                       e = pftijah_tokenizetext;
+                                     }
+                                     return 1;
                                    }
 
 {Capital}[\.]({Capital}[\.])+       { /* case ACRONYM */ 
-                                      bit repeat = 1;
-                                      strSubstitute(&c, pftijah_tokenizetext, 
".", "", &repeat); 
-                                     strLower(&e, c);
+                                     if ( use_multi_chars )  {
+                                        bit repeat = 1;
+                                        strSubstitute(&c, 
pftijah_tokenizetext, ".", "", &repeat); 
+                                       strLower(&e, c);
+                                     } else {
+                                        e = pftijah_tokenizetext;
+                                        for (c = pftijah_tokenizetext; *c != 
'\0'; c++) 
+                                          if (*c != '.') 
+                                            *(e++) = tolower(*c);
+                                        *e = '\0';
+                                        e = pftijah_tokenizetext;
+                                     }
                                       return 1;
                                     }
 
@@ -185,10 +222,15 @@
 int tokenize_flex(char* buf, struct tijahContextStruct* tjCtx) {
   // the original
   int len = strlen(buf);
+  use_multi_chars = 0;
   YY_BUFFER_STATE myBuf = yy_scan_bytes(buf, len);
   while (pftijah_tokenizelex()) {
       if ( !handleTijahTerm(tjCtx, e) )
           return 0;
+      if ( e != pftijah_tokenizetext )  {
+       GDKfree(e);
+      }
+      use_multi_chars = 0;
   }
   yy_delete_buffer(myBuf);
   return 1;
@@ -293,10 +335,15 @@
 char* tijah_tokenize_string(char* buf, int len, char* outbuf) {
   int cnt = 0;
   YY_BUFFER_STATE myBuf = yy_scan_bytes(buf, len);
+  use_multi_chars = 0;
   while (pftijah_tokenizelex()) {
       if ( cnt++ )
           strcat(outbuf," ");
       strcat(outbuf,e);
+      if ( e != pftijah_tokenizetext )  {
+       GDKfree(e);
+      }
+      use_multi_chars = 0;
   }
   yy_delete_buffer(myBuf);
   return outbuf;
@@ -307,10 +354,14 @@
   char resBUFF[256];
   int len = strlen(buf);
 
+  use_multi_chars = 0;
   YY_BUFFER_STATE myBuf = yy_scan_bytes(buf, len);
   if ( pftijah_tokenizelex() ) {
       res = &resBUFF[0];
       strncpy(res,e,256);
+      if ( e != pftijah_tokenizetext )  {
+       GDKfree(e);
+      }
   } else
       res = NULL;
   yy_delete_buffer(myBuf);


------------------------------------------------------------------------------
Apps built with the Adobe(R) Flex(R) framework and Flex Builder(TM) are
powering Web 2.0 with engaging, cross-platform capabilities. Quickly and
easily build your RIAs with Flex Builder, the Eclipse(TM)based development
software that enables intelligent coding and step-through debugging.
Download the free 60 day trial. http://p.sf.net/sfu/www-adobe-com
_______________________________________________
Monetdb-pf-checkins mailing list
Monetdb-pf-checkins@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/monetdb-pf-checkins

Reply via email to