Update of /cvsroot/monetdb/pathfinder/modules/pftijah
In directory 23jxhf1.ch3.sourceforge.com:/tmp/cvs-serv31507
Modified Files:
Tag: Feb2009
pftijah_tokenize.l
Log Message:
the previous fix was slightly incorrect.
(flex is not able to match two rules at the same time)
U pftijah_tokenize.l
Index: pftijah_tokenize.l
===================================================================
RCS file: /cvsroot/monetdb/pathfinder/modules/pftijah/pftijah_tokenize.l,v
retrieving revision 1.18.4.2
retrieving revision 1.18.4.3
diff -u -d -r1.18.4.2 -r1.18.4.3
--- pftijah_tokenize.l 25 Mar 2009 11:15:41 -0000 1.18.4.2
+++ pftijah_tokenize.l 25 Mar 2009 13:00:47 -0000 1.18.4.3
@@ -107,78 +107,67 @@
/* XML Character Classes (http://www.w3.org/TR/REC-xml/#CharClasses) */
/* 8 bit characters from the Letter class
see http://www.w3.org/TR/REC-xml/#NT-Letter */
-Capital [\x41-\x5a\xc0-\xd6\xd8-\xde]
-
-/* restrictive Words
- Word ({Capital}{Small_Letter}+)|({Small_Letter}{Small_Letter}+) */
+Capital [\x41-\x5a\xc0-\xd6\xd8-\xde]
/* most relaxed condition for words, parsing almost everything except
whitespace */
-Word {AlphaNumChar}{2,}
+Word {AlphaNumChar}{2,}
-%%
+AsciiAlphaNumChar {OneByteBaseChar}|{Digit}
-{TwoByteBaseChar} { use_multi_chars = 1; }
+AsciiWord {AsciiAlphaNumChar}{2,}
-{ThreeByteBaseChar} { use_multi_chars = 1; }
+%%
+
+{AsciiWord} { /* case simple WORD */
+ for (e = pftijah_tokenizetext; *e != '\0'; e++)
+ *e = tolower(*e);
+ e = pftijah_tokenizetext;
+ return 1;
+ }
{S} { /* zap white space */ }
{Word} { /* case WORD */
- if ( use_multi_chars ) {
- strLower(&e, pftijah_tokenizetext);
- } else {
- for (e = pftijah_tokenizetext; *e != '\0'; e++)
- *e = tolower(*e);
- e = pftijah_tokenizetext;
- }
- // stream_printf(GDKout,"EMIT[%s]\n",e);
+ strLower(&e, pftijah_tokenizetext);
return 1;
}
-{AlphaNumChar}{1,2}[-]{Word} { /* case e-mail, e-government, on-line
+{AsciiAlphaNumChar}{1,2}[-]{AsciiWord} { /* case e-mail, e-government, on-line
problem: make-up */
/* strip hyphen */
- if ( use_multi_chars ) {
- bit repeat = 0;
- strSubstitute(&c, pftijah_tokenizetext, "-", "", &repeat);
- strLower(&e, c);
-
- } else {
- e = pftijah_tokenizetext;
- for (c = pftijah_tokenizetext; *c != '\0'; c++)
+ e = pftijah_tokenizetext;
+ for (c = pftijah_tokenizetext; *c != '\0'; c++)
if (*c != '-')
*(e++) = tolower(*c);
- *e = '\0';
- e = pftijah_tokenizetext;
- }
+ *e = '\0';
+ e = pftijah_tokenizetext;
return 1;
}
-{Word}[\']{AlphaNumChar}{1,2} { /* case CONTRACTION */
- if ( use_multi_chars )
- strLower(&e, pftijah_tokenizetext);
- else {
- for (e = pftijah_tokenizetext; *e != '\0'; e++)
- *e = tolower(*e);
- e = pftijah_tokenizetext;
- }
+{AsciiWord}[\']{AsciiAlphaNumChar}{1,2} { /* case CONTRACTION */
+ /* strip "'" */
+ e = pftijah_tokenizetext;
+ for (c = pftijah_tokenizetext; *c != '\0'; c++)
+ if (*c != '\'')
+ *(e++) = tolower(*c);
+ *e = '\0';
+ e = pftijah_tokenizetext;
+ return 1;
+ }
+{Word}[\']{AlphaNumChar}{1,2} { /* case CONTRACTION */
+ bit repeat = 1;
+ strSubstitute(&c, pftijah_tokenizetext, "'", "", &repeat);
+ strLower(&e, c);
+ GDKfree(c);
return 1;
}
{Capital}[\.]({Capital}[\.])+ { /* case ACRONYM */
- if ( use_multi_chars ) {
- bit repeat = 1;
- strSubstitute(&c, pftijah_tokenizetext, ".", "", &repeat);
- strLower(&e, c);
- } else {
- e = pftijah_tokenizetext;
- for (c = pftijah_tokenizetext; *c != '\0'; c++)
- if (*c != '.')
- *(e++) = tolower(*c);
- *e = '\0';
- e = pftijah_tokenizetext;
- }
+ bit repeat = 1;
+ strSubstitute(&c, pftijah_tokenizetext, ".", "", &repeat);
+ strLower(&e, c);
+ GDKfree(c);
return 1;
}
------------------------------------------------------------------------------
Apps built with the Adobe(R) Flex(R) framework and Flex Builder(TM) are
powering Web 2.0 with engaging, cross-platform capabilities. Quickly and
easily build your RIAs with Flex Builder, the Eclipse(TM)based development
software that enables intelligent coding and step-through debugging.
Download the free 60 day trial. http://p.sf.net/sfu/www-adobe-com
_______________________________________________
Monetdb-pf-checkins mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/monetdb-pf-checkins