Update of /cvsroot/mahogany/M/src/util
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv11918/src/util

Modified Files:
        matchurl.cpp 
Log Message:
More URL wrapping detection tuning: don't assume that a slash terminates the URL; do
assume that if the next line starts with something which looks like a word, then that
line is not part of the URL

Index: matchurl.cpp
===================================================================
RCS file: /cvsroot/mahogany/M/src/util/matchurl.cpp,v
retrieving revision 1.35
retrieving revision 1.36
diff -b -u -2 -r1.35 -r1.36
--- matchurl.cpp        26 Jul 2004 12:26:22 -0000      1.35
+++ matchurl.cpp        12 Sep 2004 21:41:44 -0000      1.36
@@ -504,13 +504,4 @@
 static bool CanBeWrapped(const wxChar *p)
 {
-   // first check: if the last character on the previous line is a slash,
-   // suppose that it's the trailing slash at the end of URL.
-   //
-   // Rationale: slashes are relatively rare in the URLs and so it's unlikely
-   // that an URL is accidentally wrapped at one of them, but many URLs end in
-   // a slash
-   if ( p[-1] == '/' )
-      return false;
-
    // we consider any alphanumeric string of 3 characters an extension
    // but we have separate arrays of known extensions of other lengths
@@ -695,18 +686,4 @@
          }
 
-         // this test doesn't reduce the number of false positives all that
-         // much, finally, but does result in non recognizing some wrapped URLs
-#if 0
-         // even with all the checks below we still get too many false
-         // positives so consider that only "long" URLs are wrapped where long
-         // URLs are defined as the ones containing the CGI script parameters
-         // or some '%' chars (i.e. escaped characters)
-         if ( strcspn(start + len, "%?&\r") == (size_t)(p - start - len) )
-         {
-            // no CGI parameters, suppose it can't wrap
-            break;
-         }
-#endif // 0
-
          // heuristic text for end of URL detection
          if ( p - start > 5 && !CanBeWrapped(p) )
@@ -716,4 +693,6 @@
          }
 
+         p += 2; // go to the start of next line
+
          // Check that the beginning of next line is not the start of
          // another URL.
@@ -722,7 +701,9 @@
          // of an URL: here it should not be the case.
          int nextlen = 0;
-         int nextpos = scan(p + 2, nextlen);
-         if ( nextlen && nextpos == 0 && p[2] != '@')
+         int nextpos = scan(p, nextlen);
+         if ( nextlen && nextpos == 0 && *p != '@')
          {
+            p -= 2;
+
             // The start of the next line being the start of an URL on its own,
             // do not join the two.
@@ -730,4 +711,18 @@
          }
 
+         // check whether the next line starts with a word -- this is a good
+         // indication that the URL hasn't wrapped
+         const wxChar *q = p;
+         while ( wxIsalpha(*q) )
+            q++;
+
+         if ( *q == _T(' ') )
+         {
+            // looks like we've a word (i.e. sequence of letters terminated by
+            // space) at the start of the next line
+            p -= 2;
+            break;
+         }
+
          // it might be a wrapped URL but it might be not: it seems like we
          // get way too many false positives if we suppose that it's always
@@ -735,5 +730,5 @@
          // when they occur at the beginning of the line, possibly after some
          // white space as this is how people usually format them
-         const wxChar *q = start;
+         q = start;
          while ( q >= text && *q != '\n' )
          {
@@ -750,7 +745,7 @@
 
          // it did occur at the start (or after '<'), suppose the URL is
-         // wrapped and so continue on the next line (no need to test the first
-         // character, it had been already done above)
-         p += 3;
+         // wrapped and so we continue on the next line (and no need to test
+         // the first character, it had been already done above)
+         p++;
       }
    }



-------------------------------------------------------
This SF.Net email is sponsored by: YOU BE THE JUDGE. Be one of 170
Project Admins to receive an Apple iPod Mini FREE for your judgement on
who ports your project to Linux PPC the best. Sponsored by IBM. 
Deadline: Sept. 13. Go here: http://sf.net/ppc_contest.php
_______________________________________________
Mahogany-cvsupdates mailing list
[EMAIL PROTECTED]
https://lists.sourceforge.net/lists/listinfo/mahogany-cvsupdates

Reply via email to