Update of /cvsroot/monetdb/pathfinder/modules/pftijah
In directory 23jxhf1.ch3.sourceforge.com:/tmp/cvs-serv14762

Modified Files:
      Tag: Feb2009
        pftijah_tokenize.l 
Log Message:
fixed problems with utf-characters in tijah-tokenize


U pftijah_tokenize.l
Index: pftijah_tokenize.l
===================================================================
RCS file: /cvsroot/monetdb/pathfinder/modules/pftijah/pftijah_tokenize.l,v
retrieving revision 1.18
retrieving revision 1.18.4.1
diff -u -d -r1.18 -r1.18.4.1
--- pftijah_tokenize.l  5 Aug 2008 15:01:59 -0000       1.18
+++ pftijah_tokenize.l  24 Mar 2009 15:13:24 -0000      1.18.4.1
@@ -36,6 +36,7 @@
 #include "serialize_pftijah.h"
 #include <ctype.h>
 #include <string.h>
+#include <str.h>
 #ifdef HAVE_MALLOC_H
 #include <malloc.h>
 #endif
@@ -57,64 +58,90 @@
 
 %}
 
+/*
+ * XQuery 1.0 syntactic constructs
+ *
+ * lex/flex currently only supports 8 bit charsets. The character classes
+ * are thus limited to the lower 8 bits.
+ */
+
+/* XML Character Classes (http://www.w3.org/TR/REC-xml/#CharClasses) */
+
+/* see http://www.w3.org/TR/REC-xml/#NT-Letter */
+Letter                           {BaseChar}|{Ideographic}
+
+/* see http://www.w3.org/TR/REC-xml/#NT-BaseChar */
+OneByteBaseChar                  [\x41-\x5a\x61-\x7a]
+TwoByteBaseChar                  
\xc3[\x80-\x96\x98-\xb6\xb8-\xbf]|\xc4[\x80-\xb1\xb4-\xbe]|\xc5[\x81-\x88\x8a-\xbe]|\xc6[\x80-\xbf]|\xc7[\x80-\x83\x8d-\xb0\xb4\xb5\xba-\xbf]|\xc8[\x80-\x97]|\xc9[\x90-\xbf]|\xca[\x80-\xa8\xbb-\xbf]|\xcb[\x80\x81]|\xce[\x86\x88-\x8a\x8c\x8e-\xa1\xa3-\xbf]|\xcf[\x80-\x8e\x90-\x96\x9a\x9c\x9e\xa0\xa2-\xb3]|\xd0[\x81-\x8c\x8e-\xbf]|\xd1[\x80-\x8f\x91-\x9c\x9e-\xbf]|\xd2[\x80\x81\x90-\xbf]|\xd3[\x80-\x84\x87\x88\x8b\x8c\x90-\xab\xae-\xb5\xb8\xb9]|\xd4[\xb1-\xbf]|\xd5[\x80-\x96\x99\xa1-\xbf]|\xd6[\x80-\x86]|\xd7[\x90-\xaa\xb0-\xb2]|\xd8[\xa1-\xba]|\xd9[\x81-\x8a\xb1-\xbf]|\xda[\x80-\xb7\xba-\xbe]|\xdb[\x80-\x8e\x90-\x93\x95\xa5\xa6]
+ThreeByteBaseChar                
\xe0(\xa4[\x85-\xb9\xbd]|\xa5[\x98-\xa1]|\xa6[\x85-\x8c\x8f-\x90\x93-\xa8\xaa-\xb0\xb2\xb6-\xb9]|\xa7[\x9c\x9d\x9f-\xa1\xb0\xb1]|\xa8[\x85-\x8a\x8f\x90\x93-\xa8\xaa-\xb0\xb2\xb3\xb5\xb6\xb8\xb9]|\xa9[\x99-\x9c\x9e\xb2-\xb4]|\xaa[\x85-\x8b\x8d\x8f-\x91\x93-\xa8\xaa-\xb0\xb2\xb3\xb5-\xb9\xbd]|\xab\xa0|\xac[\x85-\x8c\x8f\x90\x93-\xa8\xaa-\xb0\xb2\xb3\xb6-\xb9\xbd]|\xad[\x9c\x9d\x9f-\xa1]|\xae[\x85-\x8a\x8e-\x90\x92-\x95\x99-\x9a\x9c\x9e\x9f\xa3\xa4\xa8-\xaa\xae-\xb5\xb7-\xb9]|\xb0[\x85-\x8c\x8e\x90\x92-\xa8\xaa-\xb3\xb5-\xb9]|\xb1[\xa0\xa1]|\xb2[\x85-\x8c\x8e-\x90\x92-\xa8\xaa-\xb3\xb5-\xb9]|\xb3[\x9e\xa0\xa1]|\xb4[\x85-\x8c\x8e-\x90\x92-\xa8\xaa-\xb9]|\xb5[\xa0\xa1]|\xb8[\x81-\xae\xb0\xb2\xb3]|\xb9[\x80-\x85]|\xba[\x81\x82\x84\x87\x88\x8a\x8d\x94-\x97\x99-\x9f\xa1-\xa3\xa5\xa7\xaa\xab\xad-\xae\xb0\xb2\xb3\xbd]|\xbb[\x80-\x84]|\xbd[\x80-\x87\x89-\xa9])|\xe1(\x82[\xa0-\xbf]|\x83[\x80-\x85\x90-\xb6]|\x84[\x80\x82\x83\x85-\x87\x89\x8b\x8c\x8e-\x92\xbc\xbe]|\x85[\x80\x8c\x8e\x90\x94\x95\x99\x9f-\xa1\xa3\xa5\xa7\xa9\xad\xae\xb2\xb3\xb5]|\x86[\x9e\xa8\xab\xae\xaf\xb7\xb8\xba\xbc-\xbf]|\x87[\x80-\x82\xab\xb0\xb9]|\xb8[\x80-\xbf]|\xb9[\x80-\xbf]|\xba[\x80-\x9b\xa0-\xbf]|\xbb[\x80-\xb9]|\xbc[\x80-\x95\x98-\x9d\xa0-\xbf]|\xbd[\x80-\x85\x88-\x8d\x90-\x97\x99\x9b\x9d\x9f-\xbd]|\xbe[\x80-\xb4\xb6-\xbc\xbe]|\xbf[\x82-\x84\x86-\x8c\x90-\x93\x96-\x9b\xa0-\xac\xb2\xb4\xb6-\xbc])|\xe2(\x84[\xa6\xaa\xab\xae]|\x86[\x80-\x82])|\xe3(\x81[\x81-\xbf]|\x82[\x80-\x94\xa1-\xbf]|\x83[\x80-\xba]|\x84[\x85-\xac])|\xea[\xb0-\xbf][\x80-\xbf]|[\xeb\xec][\x80-\xbf][\x80-\xbf]|\xed([\x80-\x9d][\x80-\xbf]|\x9e[\x80-\xa3])
+
+BaseChar                         
{OneByteBaseChar}|{TwoByteBaseChar}|{ThreeByteBaseChar}
+
+/* see http://www.w3.org/TR/REC-xml/#NT-Ideographic */
+Ideographic                      
\xe4[\xb8-\xbf][\x80-\xbf]|[\xe5-\xe8][\x80-\xbf][\x80-\xbf]|\xe9[\x80-\xbd][\x80-\xbf]|\xe9\xbe[\x80-\xa5]|\xe3\x80[\x87\xa1-\xa9]
+
+/* see http://www.w3.org/TR/REC-xml/#NT-CombiningChar */
+CombiningChar                    
\xcc[\x80-\xbf]|\xcd[\x80-\x85\xa0\xa1]|\xd2[\x83-\x86]|\xd6[\x91-\xa1\xa3-\xb9\xbb-\xbd\xbf]|\xd7[\x81\x82\x84]|\xd9[\x8b-\x92\xb0]|\xdb[\x96-\xa4\xa7\xa8\xaa-\xad]|\xe0(\xa4[\x81-\x83\xbc\xbe\xbf]|\xa5[\x80-\x8d\x91-\x94\xa2\xa3]|\xa6[\x81-\x83\xbc\xbe\xbf]|\xa7[\x80-\x84\x87\x88\x8b-\x8d\x97\xa2\xa3]|\xa8[\x82\xbc\xbe\xbf]|\xa9[\x80-\x82\x87\x88\x8b-\x8d\xb0\xb1]|\xaa[\x81-\x83\xbc\xbe\xbf]|\xab[\x80-\x85\x87-\x89\x8b-\x8d]|\xac[\x81-\x83\xbc\xbe\xbf]|\xad[\x80-\x83\x87\x88\x8b-\x8d\x96\x97]|\xae[\x82\x83\xbe\xbf]|\xaf[\x80-\x82\x86-\x88\x8a-\x8d\x97]|\xb0[\x81-\x83\xbe\xbf]|\xb1[\x80-\x84\x86-\x88\x8a-\x8d\x95\x96]|\xb2[\x82\x83\xbe\xbf]|\xb3[\x80-\x84\x86-\x88\x8a-\x8d\x95\x96]|\xb4[\x82\x83\xbe\xbf]|\xb5[\x80-\x83\x86-\x88\x8a-\x8d\x97]|\xb8[\xb1\xb4-\xba]|\xb9[\x87-\x8e]|\xba[\xb1\xb4-\xb9\xbb\xbc]|\xbb[\x88-\x8d]|\xbc[\x98\x99\xb5\xb7\xb9\xbe\xbf]|\xbd[\xb1-\xbf]|\xbe[\x80-\x84\x86-\x8b\x90-\x95\x97\x99-\xad\xb1-\xb7\xb9])|\xe2(\x83[\x90-\x9c\xa1])|\xe3(\x80[\xaa-\xaf]|\x82[\x99\x9a])
+
+/* see http://www.w3.org/TR/REC-xml/#NT-Extender */
+Extender                         
\xc2\xb7|\xcb[\x90\x91]|\xce\x87|\xd9\x80|\xe0[\xb9\xbb]\x86|\xe3\x80[\x85\xb1-\xb5]|\xe3\x82[\x9d\x9e]|\xe3\x83[\xbc-\xbe]
+
+/* Digit according to the XQuery definition */
+Digit                            [\x30-\x39]
+Digits                           {Digit}+
+
+/* XQuery WD [145] */
+PredefinedEntityRef              "&"("lt"|"gt"|"amp"|"quot"|"apos")";"
+
+
+/* XQuery WD [153] or http://www.w3.org/TR/REC-xml/#NT-CharRef */
+CharRef                          ("&#"[0-9]+";")|("&#x"[0-9a-fA-F]+";")
+
+/* XQuery WD [156]: Whitespace http://www.w3.org/TR/REC-xml/#NT-S */
+S                                {WhiteSpaceChar}+
+WhiteSpaceChar                   [\x20\x09\x0d\x0a]
+
+AlphaNumChar                    {Letter}|{Digit}|{CharRef}
+
 /* XML Character Classes (http://www.w3.org/TR/REC-xml/#CharClasses) */
 /* 8 bit characters from the Letter class
    see http://www.w3.org/TR/REC-xml/#NT-Letter */
 Capital                             [\x41-\x5a\xc0-\xd6\xd8-\xde]
-Small_Letter                        [\x61-\x7a\xdf-\xf6\xf8-\xff]
-Letter                              
[\x41-\x5a\x61-\x7a\xc0-\xd6\xd8-\xf6\xf8-\xff]
 
 /* restrictive Words
    Word                                
({Capital}{Small_Letter}+)|({Small_Letter}{Small_Letter}+) */
 
-/* 8 bit characters from the Digit class
-   see http://www.w3.org/TR/REC-xml/#NT-Digit */
-Digit                               [\x30-\x39]
-Digits                              {Digit}+
-AlphaNum                           
[\x30-\x39\x41-\x5a\x61-\x7a\xc0-\xd6\xd8-\xf6\xf8-\xff]
-NonAlphaNum                        
[^\x30-\x39\x41-\x5a\x61-\x7a\xc0-\xd6\xd8-\xf6\xf8-\xff]
 
 /* most relaxed condition for words, parsing almost everything except 
whitespace */
-Word                                {AlphaNum}{2,}
-
-WhiteSpaceChar                      [\x20\x09\x0d\x0a]
-S                                   {WhiteSpaceChar}+
+Word                                {AlphaNumChar}{2,}
 
 %%
 
-[&]{Letter}+[;]                     { /* zap symbols */ }
-[&][#]{Digits}[;]                   { /* zap symbols */ } 
 {S}                                { /* zap white space */ }
 
-{Word}                              { /* case WORD */ 
-                                      for (c = pftijah_tokenizetext; *c != 
'\0'; c++) 
-                                        *c = tolower(*c); 
-                                      return 1;
+{Word}                              { /* case WORD */
+                                      strLower(&e, pftijah_tokenizetext);
+                                     return 1;
                                    }
 
-{Letter}{1,2}[-]{Word}              { /* case e-mail, e-government, on-line 
+{AlphaNumChar}{1,2}[-]{Word}        { /* case e-mail, e-government, on-line 
                                          problem: make-up */ 
                                       /* strip hyphen */
-                                      e = pftijah_tokenizetext;
-                                      for (c = pftijah_tokenizetext; *c != 
'\0'; c++) 
-                                        if (*c != '-') 
-                                          *(e++) = tolower(*c);
-                                      *e = '\0';
+                                      bit repeat = 0;
+                                      strSubstitute(&c, pftijah_tokenizetext, 
"-", "", &repeat); 
+                                     strLower(&e, c);
                                       return 1;
                                    }
 
-{Word}[\']{Letter}{1,2}             { /* case CONTRACTION */ 
-                                      for (c = pftijah_tokenizetext; *c != 
'\0'; c++) 
-                                        *c = tolower(*c); 
+{Word}[\']{AlphaNumChar}{1,2}       { /* case CONTRACTION */ 
+                                      strLower(&e, pftijah_tokenizetext);
                                       return 1;
                                    }
 
 {Capital}[\.]({Capital}[\.])+       { /* case ACRONYM */ 
-                                      e = pftijah_tokenizetext;
-                                      for (c = pftijah_tokenizetext; *c != 
'\0'; c++) 
-                                        if (*c != '.') 
-                                          *(e++) = tolower(*c);
-                                      *e = '\0';
+                                      bit repeat = 1;
+                                      strSubstitute(&c, pftijah_tokenizetext, 
".", "", &repeat); 
+                                     strLower(&e, c);
                                       return 1;
                                     }
 
@@ -160,7 +187,7 @@
   int len = strlen(buf);
   YY_BUFFER_STATE myBuf = yy_scan_bytes(buf, len);
   while (pftijah_tokenizelex()) {
-      if ( !handleTijahTerm(tjCtx, pftijah_tokenizetext) )
+      if ( !handleTijahTerm(tjCtx, e) )
           return 0;
   }
   yy_delete_buffer(myBuf);
@@ -269,7 +296,7 @@
   while (pftijah_tokenizelex()) {
       if ( cnt++ )
           strcat(outbuf," ");
-      strcat(outbuf,pftijah_tokenizetext);
+      strcat(outbuf,e);
   }
   yy_delete_buffer(myBuf);
   return outbuf;
@@ -283,7 +310,7 @@
   YY_BUFFER_STATE myBuf = yy_scan_bytes(buf, len);
   if ( pftijah_tokenizelex() ) {
       res = &resBUFF[0];
-      strncpy(res,pftijah_tokenizetext,256);
+      strncpy(res,e,256);
   } else
       res = NULL;
   yy_delete_buffer(myBuf);


------------------------------------------------------------------------------
Apps built with the Adobe(R) Flex(R) framework and Flex Builder(TM) are
powering Web 2.0 with engaging, cross-platform capabilities. Quickly and
easily build your RIAs with Flex Builder, the Eclipse(TM)based development
software that enables intelligent coding and step-through debugging.
Download the free 60 day trial. http://p.sf.net/sfu/www-adobe-com
_______________________________________________
Monetdb-pf-checkins mailing list
Monetdb-pf-checkins@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/monetdb-pf-checkins

Reply via email to