Hello there, Wget users and hackers,

Since I could not find in Wget 1.8.2 (the latest version, to my knowledge)
the very useful feature of removing files, symlinks and directories that
are no longer present on the server while mirroring FTP sites, I
implemented it myself.  It might be of use to someone else, so I'm
posting it here for review.  While writing it, I had to get working code
as soon as possible, so it is probably far from being perfect and highly
optimized.  It's fairly well commented, so if you think you can improve
it, both in design and implementation, be my guest.  Anyway, I'd like to
hear your feedback.  If it turns out worthwhile, eventually it could
probably get included in the next release of Wget in its refined form.

Thanks!

./danfe

PS.  I'm developing under FreeBSD, and my patch will not compile cleanly
under Linux unless two small changes are made: change "dp->d_namlen" to
"dp->d_reclen" on lines 158 and 159 of patch.  I wonder why in hell
Linux does things like that?? (With all due respect.)
diff -ur wget-1.8.2/src/ftp.c wget-1.8.2-J/src/ftp.c
--- wget-1.8.2/src/ftp.c        Sat May 18 10:05:16 2002
+++ wget-1.8.2-J/src/ftp.c      Tue Dec 24 10:03:28 2002
@@ -41,6 +41,7 @@
 # include <unistd.h>
 #endif
 #include <sys/types.h>
+#include <sys/param.h>
 #include <assert.h>
 #include <errno.h>
 
@@ -1237,6 +1238,9 @@
 static struct fileinfo *delelement PARAMS ((struct fileinfo *,
                                            struct fileinfo **));
 static void freefileinfo PARAMS ((struct fileinfo *f));
+#ifndef WINDOWS
+static void ftp_remove_missing (char *, struct fileinfo *);
+#endif /* WINDOWS */
 
 /* Retrieve a list of files given in struct fileinfo linked list.  If
    a file is a symbolic link, do not retrieve it, but rather try to
@@ -1282,6 +1286,12 @@
 
   err = RETROK;                        /* in case it's not used */
 
+#ifndef WINDOWS
+  if (opt.remove_old)
+    /* Remove left-overs first.  */
+    ftp_remove_missing (url_filename (u), f);
+#endif /* WINDOWS */
+
   while (f)
     {
       char *old_target, *ofile;
@@ -1757,3 +1767,221 @@
       f = next;
     }
 }
+
+#ifndef WINDOWS
+/* Synchronize local and remote directory contents (i.e. remove files,
+   directories, and symlinks missing on server).  We do this by obtaining
+   a list of local files first, sorting both linked lists, comparing
+   them, and removing local entries missing in server's file list.
+
+   This is required for coherent FTP mirroring.
+
+   Sorting is needed since we are not sure that both server and local
+   directory traversing will use the same sorting rules (position of
+   hidden files in the list, case sensitivity, etc.).  Without sorting,
+   comparing two file lists would take O(n^2) time, instead of O(n).
+
+   We could first check whether file lists are sorted in the same
+   manner, and if they are, do not attempt to further sort them, but
+   this seems kinda cumbersome and thus omitted.  If someone feels like
+   doing this, be my guest.
+
+   Implemented by Alexey Dokuchaev ([EMAIL PROTECTED]).  */
+
+static struct fileinfo *getlocallist (const char *, int *);
+int comparefileinfo (const void *, const void *);
+static void unlinkmissing (const char *, const char *);
+
+/* Delete from TGT's directory every entry whose name does not occur in
+   the remote listing R.  TGT is a local file name and is truncated in
+   place to its directory part ("a/b" -> "a", "/b" -> "/", "b" -> ".").
+   Only names are compared; entry types are not.  */
+static void
+ftp_remove_missing (char *tgt, struct fileinfo *r)
+{
+  struct fileinfo **loc, **rmt, *l, *o;
+  int n = 0, m, i, j;
+  char *sep;
+
+  if (!r || !*tgt)              /* an empty TGT has no room for "." */
+    return;
+
+  sep = strrchr (tgt, '/');
+  if (!sep)
+    strcpy (tgt, ".");          /* bare file name: current directory */
+  else if (sep == tgt)
+    sep[1] = '\0';              /* keep the root directory itself */
+  else
+    *sep = '\0';
+
+  /* Collect the remote entries into an array so they can be sorted.  */
+  for (l = r; l; l = l->next)
+    n++;
+  rmt = (struct fileinfo **)xmalloc (n * sizeof (struct fileinfo *));
+  for (i = 0, l = r; l; l = l->next)
+    rmt[i++] = l;
+  qsort (rmt, n, sizeof (struct fileinfo *), comparefileinfo);
+
+  /* NULL means the directory could not be opened, (struct fileinfo *)-1
+     that it holds nothing but "." and "..".  */
+  if (!(o = l = getlocallist (tgt, &m)))
+  {
+    logprintf (LOG_NOTQUIET, _("Failed to clean up `%s'.\n"), tgt);
+    xfree (rmt);
+    return;
+  }
+  else if (o == (struct fileinfo *)-1)
+  {
+    logprintf (LOG_VERBOSE, _("Directory `%s' is empty, nothing to clean up.\n"), tgt);
+    xfree (rmt);
+    return;
+  }
+
+  loc = (struct fileinfo **)xmalloc (m * sizeof (struct fileinfo *));
+  for (i = 0; i < m; i++, l = l->next)
+    loc[i] = l;
+  qsort (loc, m, sizeof (struct fileinfo *), comparefileinfo);
+
+  /* Merge the two sorted lists: a local name that sorts before the
+     current remote name cannot appear later in the remote list.  */
+  for (i = j = 0; i < n && j < m;)
+  {
+    int q = strcmp (rmt[i]->name, loc[j]->name);
+
+    if (q > 0)
+      unlinkmissing (tgt, loc[j++]->name);
+    else
+    {
+      i++;
+      if (!q)
+        j++;
+    }
+  }
+
+  /* Whatever is left locally sorts after every remote name.  */
+  while (j < m)
+    unlinkmissing (tgt, loc[j++]->name);
+
+  freefileinfo (o);
+  xfree (rmt);
+  xfree (loc);
+}
+
+/* List DIR without "." and "..", storing the entry count in *N.  Returns
+   NULL if DIR cannot be opened, (struct fileinfo *)-1 if nothing is left.  */
+static struct fileinfo *
+getlocallist (const char *dir, int *n)
+{
+  DIR *d;
+  struct dirent *dp;
+  struct fileinfo *cur, *prev = NULL, *orig = (struct fileinfo *)-1;
+
+  if (!(d = opendir (dir)))
+    return NULL;
+
+  for (*n = 0; (dp = readdir (d));)
+  {
+    /* #### Should check for DT_DIR || DT_REG || DT_LNK here.  */
+    if (strcmp (dp->d_name, ".") && strcmp (dp->d_name, ".."))
+    {
+      cur = (struct fileinfo *)xmalloc (sizeof (struct fileinfo));
+      cur->type = dp->d_type;
+      cur->name = (char *)xmalloc (strlen (dp->d_name) + 1);
+      strcpy (cur->name, dp->d_name);   /* d_namlen is BSD-only */
+      cur->linkto = NULL;   /* for freefileinfo() */
+      cur->prev = prev;
+      cur->next = NULL;
+      if (prev)
+        prev->next = cur;
+      prev = cur;
+      if (orig == (struct fileinfo *)-1)
+        orig = cur;
+      (*n)++;
+    }
+  }
+  closedir (d);
+  return orig;   /* don't forget to freefileinfo() it! */
+}
+
+static int removedir (const char *);
+
+/* Remove entry NM of directory PATH, logging the outcome.  lstat() keeps a
+   symlink to a directory from having its target's contents deleted.  */
+static void
+unlinkmissing (const char *path, const char *nm)
+{
+  struct stat sb;
+  char *fp = xmalloc (strlen (path) + strlen (nm) + 2);   /* '/' and NUL */
+
+  strcpy (fp, path); strcat (fp, "/"); strcat (fp, nm);
+  if (lstat (fp, &sb))
+    ;                           /* cannot be examined: leave it alone */
+  else if (S_ISDIR (sb.st_mode))
+  {
+    if (removedir (fp))
+      logprintf (LOG_NOTQUIET, _("Could not remove stale directory `%s': %s.\n"),
+                 nm, strerror (errno));
+    else
+      logprintf (LOG_VERBOSE, _("Removed stale directory `%s'.\n"), nm);
+  }
+  else if (S_ISLNK (sb.st_mode))
+  {
+    if (unlink (fp))
+      logprintf (LOG_NOTQUIET, _("Could not remove stale symlink `%s': %s.\n"), nm,
+                 strerror (errno));
+    else
+      logprintf (LOG_VERBOSE, _("Removed stale symlink `%s'.\n"), nm);
+  }
+  else
+  {
+    if (unlink (fp))
+      logprintf (LOG_NOTQUIET, _("Could not remove stale file `%s': %s.\n"), nm,
+                 strerror (errno));
+    else
+      logprintf (LOG_VERBOSE, _("Removed stale file `%s'.\n"), nm);
+  }
+  xfree (fp);
+}
+
+/* Recursively delete directory PATH and everything beneath it.  Returns 0
+   on success and non-zero if anything could not be removed.  */
+static int
+removedir (const char *path)
+{
+  DIR *d;
+  struct dirent *dp;
+  struct stat sb;
+  char *fp, *p;
+  int r = 0;
+
+  if (!(d = opendir (path)))
+    return 1;
+
+  fp = xmalloc (strlen (path) + MAXPATHLEN);
+  strcpy (fp, path); strcat (fp, "/");
+  p = fp + strlen (fp);   /* where each entry name gets written */
+
+  while ((dp = readdir (d)) != NULL)
+  {
+    if (strcmp (dp->d_name, ".") && strcmp (dp->d_name, ".."))
+    {
+      strcpy (p, dp->d_name);
+      /* d_type may be DT_UNKNOWN; lstat() also never follows links.  */
+      if (!lstat (fp, &sb) && S_ISDIR (sb.st_mode))
+        r |= removedir (fp);
+      else
+        r |= unlink (fp);
+    }
+  }
+  xfree (fp);
+  closedir (d);
+  r |= rmdir (path);
+  return r;
+}
+
+int comparefileinfo (const void *a, const void *b)   /* qsort() callback */
+{
+  struct fileinfo *const *lhs = a, *const *rhs = b;   /* slots being sorted */
+  return strcmp ((*lhs)->name, (*rhs)->name);
+}
+#endif /* WINDOWS */
diff -ur wget-1.8.2/src/ftp.h wget-1.8.2-J/src/ftp.h
--- wget-1.8.2/src/ftp.h        Sun May 19 10:04:53 2002
+++ wget-1.8.2-J/src/ftp.h      Fri Dec 20 21:53:31 2002
@@ -30,6 +30,9 @@
 #ifndef FTP_H
 #define FTP_H
 
+/* Need it for enum ftype.  */
+#include <dirent.h>
+
 /* Need it for struct rbuf.  */
 #include "rbuf.h"
 
@@ -61,10 +64,10 @@
 /* File types.  */
 enum ftype
 {
-  FT_PLAINFILE,
-  FT_DIRECTORY,
-  FT_SYMLINK,
-  FT_UNKNOWN
+  FT_PLAINFILE = DT_REG,       /* values mirror dirent d_type, so a */
+  FT_DIRECTORY = DT_DIR,       /* local entry's d_type can be stored */
+  FT_SYMLINK = DT_LNK,         /* in a struct fileinfo unchanged */
+  FT_UNKNOWN = DT_UNKNOWN
 };
 
 
diff -ur wget-1.8.2/src/init.c wget-1.8.2-J/src/init.c
--- wget-1.8.2/src/init.c       Sat May 18 10:05:19 2002
+++ wget-1.8.2-J/src/init.c     Fri Dec 20 22:01:12 2002
@@ -180,6 +180,9 @@
   { "reject",          &opt.rejects,           cmd_vector },
   { "relativeonly",    &opt.relative_only,     cmd_boolean },
   { "removelisting",   &opt.remove_listing,    cmd_boolean },
+#ifndef WINDOWS
+  { "removeold",       &opt.remove_old,        cmd_boolean },
+#endif /* WINDOWS */
   { "retrsymlinks",    &opt.retr_symlinks,     cmd_boolean },
   { "robots",          &opt.use_robots,        cmd_boolean },
   { "savecookies",     &opt.cookies_output,    cmd_file },
@@ -266,6 +269,10 @@
   opt.dot_bytes = 1024;
   opt.dot_spacing = 10;
   opt.dots_in_line = 50;
+
+#ifndef WINDOWS
+  opt.remove_old = 0;
+#endif /* WINDOWS */
 }
 
 /* Return the user's home directory (strdup-ed), or NULL if none is
diff -ur wget-1.8.2/src/main.c wget-1.8.2-J/src/main.c
--- wget-1.8.2/src/main.c       Sat May 18 10:05:19 2002
+++ wget-1.8.2-J/src/main.c     Fri Dec 20 22:26:27 2002
@@ -222,6 +222,7 @@
   -k,  --convert-links      convert non-relative links to relative.\n\
   -K,  --backup-converted   before converting file X, back up as X.orig.\n\
   -m,  --mirror             shortcut option equivalent to -r -N -l inf -nr.\n\
+  -J,  --remove-old         remove files and directories not present on server.\n\
   -p,  --page-requisites    get all images, etc. needed to display HTML page.\n\
 \n"), stdout);
   fputs (_("\
@@ -280,6 +281,9 @@
     { "random-wait", no_argument, NULL, 165 },
     { "recursive", no_argument, NULL, 'r' },
     { "relative", no_argument, NULL, 'L' },
+#ifndef WINDOWS
+    { "remove-old", no_argument, NULL, 'J' },
+#endif /* WINDOWS */
     { "retr-symlinks", no_argument, NULL, 137 },
     { "save-headers", no_argument, NULL, 's' },
     { "server-response", no_argument, NULL, 'S' },
@@ -363,7 +367,7 @@
       that the options with required arguments must be followed by a ':'.
       -- Dan Harkless <[EMAIL PROTECTED]>] */
   while ((c = getopt_long (argc, argv, "\
-hpVqvdkKsxmNWrHSLcFbEY:G:g:T:U:O:l:n:i:o:a:t:D:A:R:P:B:e:Q:X:I:w:C:",
+hpVqvdkKsxmJNWrHSLcFbEY:G:g:T:U:O:l:n:i:o:a:t:D:A:R:P:B:e:Q:X:I:w:C:",
                           long_options, (int *)0)) != EOF)
     {
       switch (c)
@@ -437,6 +441,11 @@
        case 'F':
          setval ("forcehtml", "on");
          break;
+#ifndef WINDOWS
+       case 'J':
+         setval ("removeold", "on");
+         break;
+#endif /* WINDOWS */
        case 'H':
          setval ("spanhosts", "on");
          break;
diff -ur wget-1.8.2/src/options.h wget-1.8.2-J/src/options.h
--- wget-1.8.2/src/options.h    Sat May 18 10:05:20 2002
+++ wget-1.8.2-J/src/options.h  Fri Dec 20 22:09:49 2002
@@ -146,6 +146,10 @@
                                   locally? */
   int remove_listing;          /* Do we remove .listing files
                                   generated by FTP? */
+#ifndef WINDOWS
+  int remove_old;              /* Do we remove files and directories
+                                  not present on FTP server? */
+#endif /* WINDOWS */
   int htmlify;                 /* Do we HTML-ify the OS-dependent
                                   listings? */
 

Reply via email to