I'm aware that there's a desire to re-write the ftp portion of wget, but
here is a patch against CVS that has so far allowed me to spider ftp URLs.
It's a dirty hack that simply uses the opt.spider variable to keep from
downloading files by returning RETROK (or maybe it was RETRFINISHED) after
observing whether there was a 505 or 200 from the "RETR" command.  Also
using opt.spider, I attempted to stop any calculations or displaying of
downloads.  Thus I didn't really verify whether this is the proper
protocol to "spider" in ftp, or whether all handles were closed properly.

And AFAICT, it appears to be spidering when --recursive is used.  Right
now it will create the directories to write the ".listing" files (which
can be shut off with --no-directories).

I've been validating URLs in the GNU Free Software Directory's CVS
repository, and so far nothing has been downloaded into the working
directory (I do have --output-document set to /dev/null to make sure).  I
didn't completely verify whether --verbose or --debug are still outputting
legitimate information with --spider.

I think that's everything I know.

wget is great (especially --spider),
/a

ChangeLog and diff to ftp.c follow and are also attached.

2003-02-06  Aaron Hawley <[EMAIL PROTECTED]>

        * ftp.c
        (getftp): --spider option should now work with FTP.
        (ftp_loop_internal): --spider option will not calculate or
        show what was downloaded (nor delete from using --delete-after).
        (ftp_loop): --spider will not HTML-ify listing.

Index: ftp.c
===================================================================
RCS file: /pack/anoncvs/wget/src/ftp.c,v
retrieving revision 1.61
diff -u -r1.61 ftp.c
--- ftp.c       2003/01/11 20:12:35     1.61
+++ ftp.c       2003/02/07 01:48:37
@@ -818,6 +818,9 @@
       expected_bytes = ftp_expected_bytes (ftp_last_respline);
     } /* cmd & DO_LIST */

+  if (!(cmd & (DO_LIST | DO_RETR)) || (opt.spider && !(cmd & DO_LIST)))
+    return RETRFINISHED;
+
   /* Some FTP servers return the total length of file after REST
      command, others just return the remaining size. */
   if (*len && restval && expected_bytes
@@ -828,9 +831,6 @@
     }

   /* If no transmission was required, then everything is OK.  */
-  if (!(cmd & (DO_LIST | DO_RETR)))
-    return RETRFINISHED;
-
   if (!pasv_mode_open)  /* we are not using pasive mode so we need
                              to accept */
     {
@@ -1153,7 +1153,8 @@
        }
       /* Time?  */
       tms = time_str (NULL);
-      tmrate = retr_rate (len - restval, con->dltime, 0);
+      if (!opt.spider)
+        tmrate = retr_rate (len - restval, con->dltime, 0);

       /* If we get out of the switch above without continue'ing, we've
         successfully downloaded a file.  Remember this fact. */
@@ -1164,8 +1165,9 @@
          CLOSE (RBUF_FD (&con->rbuf));
          rbuf_uninitialize (&con->rbuf);
        }
-      logprintf (LOG_VERBOSE, _("%s (%s) - `%s' saved [%ld]\n\n"),
-                tms, tmrate, locf, len);
+      if (!opt.spider)
+        logprintf (LOG_VERBOSE, _("%s (%s) - `%s' saved [%ld]\n\n"),
+                  tms, tmrate, locf, len);
       if (!opt.verbose && !opt.quiet)
        {
          /* Need to hide the password from the URL.  The `if' is here
@@ -1192,7 +1194,7 @@
             by the more specific option --dont-remove-listing, and the code
             to do this deletion is in another function. */
        }
-      else
+      else if (!opt.spider)
        /* This is not a directory listing file. */
        {
          /* Unlike directory listing files, don't pretend normal files weren't
@@ -1718,7 +1720,7 @@

       if (res == RETROK)
        {
-         if (opt.htmlify)
+         if (opt.htmlify && !opt.spider)
            {
              char *filename = (opt.output_document
                                ? xstrdup (opt.output_document)
Index: ChangeLog
===================================================================
RCS file: /pack/anoncvs/wget/src/ChangeLog,v
retrieving revision 1.417
diff -u -r1.417 ChangeLog
--- ChangeLog   2003/01/11 20:12:35     1.417
+++ ChangeLog   2003/02/07 01:49:49
@@ -1,3 +1,11 @@
+2003-02-06  Aaron Hawley <[EMAIL PROTECTED]>
+
+       * ftp.c
+       (getftp): --spider option should now work with FTP.
+       (ftp_loop_internal): --spider option will not calculate or
+       show what was downloaded (nor delete from using --delete-after).
+       (ftp_loop): --spider will not HTML-ify listing.
+
 2003-01-11  Ian Abbott <[EMAIL PROTECTED]>
 
        * ftp.c (ftp_retrieve_glob): Reject insecure filenames as determined
Index: ftp.c
===================================================================
RCS file: /pack/anoncvs/wget/src/ftp.c,v
retrieving revision 1.61
diff -u -r1.61 ftp.c
--- ftp.c       2003/01/11 20:12:35     1.61
+++ ftp.c       2003/02/07 01:48:37
@@ -818,6 +818,9 @@
       expected_bytes = ftp_expected_bytes (ftp_last_respline);
     } /* cmd & DO_LIST */
 
+  if (!(cmd & (DO_LIST | DO_RETR)) || (opt.spider && !(cmd & DO_LIST)))
+    return RETRFINISHED;
+
   /* Some FTP servers return the total length of file after REST
      command, others just return the remaining size. */
   if (*len && restval && expected_bytes
@@ -828,9 +831,6 @@
     }
 
   /* If no transmission was required, then everything is OK.  */
-  if (!(cmd & (DO_LIST | DO_RETR)))
-    return RETRFINISHED;
-
   if (!pasv_mode_open)  /* we are not using pasive mode so we need
                              to accept */
     {
@@ -1153,7 +1153,8 @@
        }
       /* Time?  */
       tms = time_str (NULL);
-      tmrate = retr_rate (len - restval, con->dltime, 0);
+      if (!opt.spider)
+        tmrate = retr_rate (len - restval, con->dltime, 0);
 
       /* If we get out of the switch above without continue'ing, we've
         successfully downloaded a file.  Remember this fact. */
@@ -1164,8 +1165,9 @@
          CLOSE (RBUF_FD (&con->rbuf));
          rbuf_uninitialize (&con->rbuf);
        }
-      logprintf (LOG_VERBOSE, _("%s (%s) - `%s' saved [%ld]\n\n"),
-                tms, tmrate, locf, len);
+      if (!opt.spider)
+        logprintf (LOG_VERBOSE, _("%s (%s) - `%s' saved [%ld]\n\n"),
+                  tms, tmrate, locf, len);
       if (!opt.verbose && !opt.quiet)
        {
          /* Need to hide the password from the URL.  The `if' is here
@@ -1192,7 +1194,7 @@
             by the more specific option --dont-remove-listing, and the code
             to do this deletion is in another function. */
        }
-      else
+      else if (!opt.spider)
        /* This is not a directory listing file. */
        {
          /* Unlike directory listing files, don't pretend normal files weren't
@@ -1718,7 +1720,7 @@
 
       if (res == RETROK)
        {
-         if (opt.htmlify)
+         if (opt.htmlify && !opt.spider)
            {
              char *filename = (opt.output_document
                                ? xstrdup (opt.output_document)

Reply via email to