Update of /cvsroot/mahogany/M/src/util
In directory usw-pr-cvs1:/tmp/cvs-serv28930/src/util

Modified Files:
        matchurl.cpp 
Log Message:
tweaks to the URL detection: don't include trailing ! and ? in it; try to 
automatically detect the URL end if it ends with an extension; disabled CGI parameters 
heuristic test

Index: matchurl.cpp
===================================================================
RCS file: /cvsroot/mahogany/M/src/util/matchurl.cpp,v
retrieving revision 1.18
retrieving revision 1.19
diff -b -u -2 -r1.18 -r1.19
--- matchurl.cpp        8 Sep 2002 19:23:12 -0000       1.18
+++ matchurl.cpp        19 Oct 2002 21:44:01 -0000      1.19
@@ -555,4 +555,7 @@
          }
 
+         // this test doesn't reduce the number of false positives all that
+         // much, finally, but does result in not recognizing some wrapped URLs
+#if 0
          // even with all the checks below we still get too many false
          // positives so consider that only "long" URLs are wrapped where long
@@ -564,4 +567,36 @@
             break;
          }
+#endif // 0
+
+         // heuristic test for end of URL detection: consider that if it ends
+         // with an extension (i.e. ".xyz") then it's the end of the URL
+         if ( p - start > 5 && p[-4] == '.' &&
+               isalnum(p[-3]) && isalnum(p[-2]) && isalnum(p[-1]) )
+         {
+            // some special cases where the URL could still be wrapped: html,
+            // jpeg and tiff are common 4 letter extensions
+            static const char *longExtensions[] =
+            {
+               "html", "jpeg", "tiff"
+            };
+
+            size_t n;
+            for ( n = 0; n < WXSIZEOF(longExtensions); n++ )
+            {
+               const char * const ext = longExtensions[n];
+               if ( strncmp(p - 3, ext, 3) == 0 && p[2] == ext[3] )
+               {
+                  // looks like a long extension got wrapped
+                  break;
+               }
+            }
+
+            if ( n == WXSIZEOF(longExtensions) )
+            {
+               // it doesn't look like the extension is continued on the next line,
+               // consider this to be the end of the URL
+               break;
+            }
+         }
 
          // Check that the beginning of next line is not the start of
@@ -580,5 +615,5 @@
 
          // it might be a wrapped URL but it might be not: it seems like we
-         // get way too many false positives if we suppose that it's already
+         // get way too many false positives if we suppose that it's always
          // the case... so restrict the wrapped URLs detection to the case
          // when they occur at the beginning of the line, possibly after some
@@ -596,7 +631,7 @@
             break;
 
-         // it did occur at the start (or after '<'), suppose the URL is wrapped and so
-         // continue on the next line (no need to test the first character,
-         // it had been already done above)
+         // it did occur at the start (or after '<'), suppose the URL is
+         // wrapped and so continue on the next line (no need to test the first
+         // character, it had been already done above)
          p += 3;
       }
@@ -604,5 +639,5 @@
 
    // truncate any punctuation at the end
-   while ( strchr(".:,;)", *(p - 1)) )
+   while ( strchr(".:,;)!?", *(p - 1)) )
       p--;
 



-------------------------------------------------------
This sf.net email is sponsored by:
Access Your PC Securely with GoToMyPC. Try Free Now
https://www.gotomypc.com/s/OSND/DD
_______________________________________________
Mahogany-cvsupdates mailing list
[EMAIL PROTECTED]
https://lists.sourceforge.net/lists/listinfo/mahogany-cvsupdates

Reply via email to