Hi,

I find it useful to keep track of where files are downloaded from.  POSIX
extended attributes provide a lightweight portable method of keeping this
information across Linux, OS/X, FreeBSD and many other platforms.

This complements wget's existing WARC support, which serves a related but
different use case closer to tcpdump or tar for web pages.  Extended
attributes can provide a quick answer to "where did I get this file from
again?"

This patch changes:
*   autoconf detects whether extended attributes are available and enables
the code if they are.
*   The new flags --xattr and --no-xattr control whether xattr is enabled.
*   The new command "xattr = (on|off)" can be used in ~/.wgetrc or
/etc/wgetrc
*   The original and redirected URLs are recorded as shown below.
*   This works for both single fetches and recursive mode.

Here is an example, where http://archive.org redirects to
https://archive.org:
$ wget --xattr http://archive.org
...
$ getfattr -d index.html
user.xdg.origin.url="https://archive.org/";
user.xdg.referrer.url="http://archive.org/";

These attributes were chosen based on those stored by Google Chrome (
https://bugs.chromium.org/p/chromium/issues/detail?id=45903) and curl (
https://github.com/curl/curl/blob/master/src/tool_xattr.c)

-- 
Sean Burford <[email protected]>
From c6a259bdc1de21b2487943d329f155ca4b14ff38 Mon Sep 17 00:00:00 2001
From: Sean Burford <[email protected]>
Date: Thu, 21 Jul 2016 14:15:49 +1000
Subject: [PATCH] [xattr] Keep fetched URLs in POSIX extended attributes

These attributes provide a lightweight method of later determining
where a file was downloaded from.

This patch changes:
*   autoconf detects whether extended attributes are available and
    enables the code if they are.
*   The new flags --xattr and --no-xattr control whether xattr is enabled.
*   The new command "xattr = (on|off)" can be used in ~/.wgetrc or /etc/wgetrc
*   The original and redirected URLs are recorded as shown below.
*   This works for both single fetches and recursive mode.

The attributes that are set are:
user.xdg.origin.url: The URL that the content was fetched from.
user.xdg.referrer.url: The URL that was originally requested.

Here is an example, where http://archive.org redirects to https://archive.org:
$ wget --xattr http://archive.org
...
$ getfattr -d index.html
user.xdg.origin.url="https://archive.org/";
user.xdg.referrer.url="http://archive.org/";

These attributes were chosen based on those stored by Google Chrome
https://bugs.chromium.org/p/chromium/issues/detail?id=45903
and curl https://github.com/curl/curl/blob/master/src/tool_xattr.c
---
 configure.ac    | 27 ++++++++++++++++++++++
 src/Makefile.am |  8 +++++--
 src/ftp.c       | 10 ++++++++
 src/http.c      | 23 ++++++++++++++++---
 src/init.c      |  9 ++++++++
 src/main.c      |  7 ++++++
 src/options.h   |  2 ++
 src/xattr.c     | 71 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 src/xattr.h     | 45 ++++++++++++++++++++++++++++++++++++
 9 files changed, 197 insertions(+), 5 deletions(-)
 create mode 100644 src/xattr.c
 create mode 100644 src/xattr.h

diff --git a/configure.ac b/configure.ac
index eb303f0..cce20c6 100644
--- a/configure.ac
+++ b/configure.ac
@@ -771,11 +771,37 @@ AS_IF([test "X$with_cares" == "Xyes"],[
   RESOLVER_INFO="libc, --bind-dns-address and --dns-servers not available"
 ])
 
+dnl
+dnl Extended Attribute support
+dnl
+
+AC_ARG_ENABLE([xattr],
+  [AS_HELP_STRING([--disable-xattr], [disable support for POSIX Extended Attributes])],
+  [ENABLE_XATTR=$enableval],
+  [ENABLE_XATTR=yes])
+
+case "$host_os" in
+  *linux* | *darwin*) xattr_syscalls="fsetxattr" ;;
+  freebsd*)           xattr_syscalls="extattr_set_fd" ;;
+  *)  AC_MSG_NOTICE([Disabling Extended Attribute support: your system is not known to support extended attributes.])
+      ENABLE_XATTR=no
+esac
+
+if test "X${ENABLE_XATTR}" = "Xyes"; then
+  AC_CHECK_FUNCS([$xattr_syscalls], [], [
+    AC_MSG_NOTICE([Disabling Extended Attribute support: your system does not support $xattr_syscalls])
+    ENABLE_XATTR=no
+  ])
+fi
+
+test "X${ENABLE_XATTR}" = "Xyes" && AC_DEFINE([ENABLE_XATTR], 1,
+    [Define if you want file meta-data storing into POSIX Extended Attributes compiled in.])
 
 dnl Needed by src/Makefile.am
 AM_CONDITIONAL([IRI_IS_ENABLED], [test "X$iri" != "Xno"])
 AM_CONDITIONAL([WITH_SSL], [test "X$with_ssl" != "Xno"])
 AM_CONDITIONAL([METALINK_IS_ENABLED], [test "X$with_metalink" != "Xno"])
+AM_CONDITIONAL([WITH_XATTR], [test "X$ENABLE_XATTR" != "Xno"])
 
 dnl
 dnl Create output
@@ -801,6 +827,7 @@ AC_MSG_NOTICE([Summary of build options:
   Digest:            $ENABLE_DIGEST
   NTLM:              $ENABLE_NTLM
   OPIE:              $ENABLE_OPIE
+  POSIX xattr:       $ENABLE_XATTR
   Debugging:         $ENABLE_DEBUG
   Assertions:        $ENABLE_ASSERTION
   Valgrind:          $VALGRIND_INFO
diff --git a/src/Makefile.am b/src/Makefile.am
index ef0b37e..5311bba 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -39,6 +39,10 @@ if METALINK_IS_ENABLED
 METALINK_OBJ = metalink.c
 endif
 
+if WITH_XATTR
+XATTR_OBJ = xattr.c
+endif
+
 # The following line is losing on some versions of make!
 DEFS     = @DEFS@ -DSYSTEM_WGETRC=\"$(sysconfdir)/wgetrc\" -DLOCALEDIR=\"$(localedir)\"
 
@@ -49,14 +53,14 @@ wget_SOURCES = connect.c convert.c cookies.c ftp.c	\
 		css_.c css-url.c	\
 		ftp-basic.c ftp-ls.c hash.c host.c hsts.c html-parse.c html-url.c	\
 		http.c init.c log.c main.c netrc.c progress.c ptimer.c	\
-		recur.c res.c retr.c spider.c url.c warc.c	\
+		recur.c res.c retr.c spider.c url.c warc.c $(XATTR_OBJ) \
 		utils.c exits.c build_info.c $(IRI_OBJ) $(METALINK_OBJ)	\
 		css-url.h css-tokens.h connect.h convert.h cookies.h	\
 		ftp.h hash.h host.h hsts.h  html-parse.h html-url.h	\
 		http.h http-ntlm.h init.h log.h mswindows.h netrc.h	\
 		options.h progress.h ptimer.h recur.h res.h retr.h	\
 		spider.h ssl.h sysdep.h url.h warc.h utils.h wget.h iri.h	\
-		exits.h version.h metalink.h
+		exits.h version.h metalink.h xattr.h
 nodist_wget_SOURCES = version.c
 EXTRA_wget_SOURCES = iri.c
 LDADD = $(LIBOBJS) ../lib/libgnu.a $(GETADDRINFO_LIB) $(HOSTENT_LIB) $(INET_NTOP_LIB) $(LIBSOCKET)\
diff --git a/src/ftp.c b/src/ftp.c
index 88a9777..27d90d6 100644
--- a/src/ftp.c
+++ b/src/ftp.c
@@ -52,6 +52,9 @@ as that of the covered work.  */
 #include "recur.h"              /* for INFINITE_RECURSION */
 #include "warc.h"
 #include "c-strcase.h"
+#ifdef ENABLE_XATTR
+#include "xattr.h"
+#endif
 
 #ifdef __VMS
 # include "vms.h"
@@ -1546,6 +1549,13 @@ Error in server response, closing control connection.\n"));
   tmrate = retr_rate (rd_size, con->dltime);
   total_download_time += con->dltime;
 
+#ifdef ENABLE_XATTR
+  if (opt.enable_xattr)
+    {
+      set_file_metadata (u->url, NULL, fp);
+    }
+#endif
+
   fd_close (local_sock);
   /* Close the local file.  */
   if (!output_stream || con->cmd & DO_LIST)
diff --git a/src/http.c b/src/http.c
index 7e60a07..0cd142c 100644
--- a/src/http.c
+++ b/src/http.c
@@ -66,6 +66,9 @@ as that of the covered work.  */
 # include "metalink.h"
 # include "xstrndup.h"
 #endif
+#ifdef ENABLE_XATTR
+#include "xattr.h"
+#endif
 
 #ifdef TESTING
 #include "test.h"
@@ -2892,8 +2895,8 @@ fail:
    If PROXY is non-NULL, the connection will be made to the proxy
    server, and u->url will be requested.  */
 static uerr_t
-gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy,
-         struct iri *iri, int count)
+gethttp (struct url *u, struct url *original_url, struct http_stat *hs,
+         int *dt, struct url *proxy, struct iri *iri, int count)
 {
   struct request *req = NULL;
 
@@ -3754,6 +3757,20 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy,
       goto cleanup;
     }
 
+#ifdef ENABLE_XATTR
+  if (opt.enable_xattr)
+    {
+      if (original_url != u)
+        {
+          set_file_metadata (u->url, original_url->url, fp);
+        }
+      else
+        {
+          set_file_metadata (u->url, NULL, fp);
+        }
+    }
+#endif
+
   err = read_response_body (hs, sock, fp, contlen, contrange,
                             chunked_transfer_encoding,
                             u->url, warc_timestamp_str,
@@ -3972,7 +3989,7 @@ http_loop (struct url *u, struct url *original_url, char **newloc,
         *dt &= ~SEND_NOCACHE;
 
       /* Try fetching the document, or at least its head.  */
-      err = gethttp (u, &hstat, dt, proxy, iri, count);
+      err = gethttp (u, original_url, &hstat, dt, proxy, iri, count);
 
       /* Time?  */
       tms = datetime_str (time (NULL));
diff --git a/src/init.c b/src/init.c
index d043d83..06d2e44 100644
--- a/src/init.c
+++ b/src/init.c
@@ -339,6 +339,9 @@ static const struct {
 #ifdef USE_WATT32
   { "wdebug",           &opt.wdebug,            cmd_boolean },
 #endif
+#ifdef ENABLE_XATTR
+  { "xattr",            &opt.enable_xattr,      cmd_boolean },
+#endif
 };
 
 /* Look up CMDNAME in the commands[] and return its position in the
@@ -482,6 +485,12 @@ defaults (void)
   /* HSTS is enabled by default */
   opt.hsts = true;
 #endif
+
+#ifdef ENABLE_XATTR
+  opt.enable_xattr = true;
+#else
+  opt.enable_xattr = false;
+#endif
 }
 
 /* Return the user's home directory (strdup-ed), or NULL if none is
diff --git a/src/main.c b/src/main.c
index a5617da..4d69e03 100644
--- a/src/main.c
+++ b/src/main.c
@@ -436,6 +436,9 @@ static struct cmdline_option option_data[] =
 #ifdef USE_WATT32
     { "wdebug", 0, OPT_BOOLEAN, "wdebug", -1 },
 #endif
+#ifdef ENABLE_XATTR
+    { "xattr", 0, OPT_BOOLEAN, "xattr", -1 },
+#endif
   };
 
 #undef IF_SSL
@@ -704,6 +707,10 @@ Download:\n"),
     N_("\
        --preferred-location        preferred location for Metalink resources\n"),
 #endif
+#ifdef ENABLE_XATTR
+    N_("\
+       --no-xattr                  turn off storage of metadata in extended file attributes\n"),
+#endif
     "\n",
 
     N_("\
diff --git a/src/options.h b/src/options.h
index 85d2de1..b2e31a8 100644
--- a/src/options.h
+++ b/src/options.h
@@ -127,6 +127,8 @@ struct options
   bool warc_keep_log;           /* Store the log file in a WARC record. */
   char **warc_user_headers;     /* User-defined WARC header(s). */
 
+  bool enable_xattr;            /* Store metadata in POSIX extended attributes. */
+
   char *user;                   /* Generic username */
   char *passwd;                 /* Generic password */
   bool ask_passwd;              /* Ask for password? */
diff --git a/src/xattr.c b/src/xattr.c
new file mode 100644
index 0000000..360b032
--- /dev/null
+++ b/src/xattr.c
@@ -0,0 +1,71 @@
+/* xattr.c -- POSIX Extended Attribute support.
+
+   Copyright (C) 2016 Free Software Foundation, Inc.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.  */
+
+#include "wget.h"
+
+#include <stdio.h>
+#include <string.h>
+
+#include "log.h"
+#include "xattr.h"
+
+#ifdef USE_XATTR
+/* Set attribute NAME to VALUE on FP's descriptor; 0 on success, else < 0.  */
+static int
+write_xattr_metadata (const char *name, const char *value, FILE *fp) {
+  int retval = -1;
+  if (name && value && fp)
+    {
+      retval = fsetxattr (fileno(fp), name, value, strlen(value), 0);
+      /* FreeBSD's extattr_set_fd returns the attribute length; map success to 0. */
+      retval = (retval < 0)? retval : 0;
+    }
+  return retval;
+}
+
+#else /* USE_XATTR */
+
+static int
+write_xattr_metadata (const char *name, const char *value, FILE *fp) {
+  (void)name;
+  (void)value;
+  (void)fp;
+
+  return 0;
+}
+
+#endif /* USE_XATTR */
+
+int
+set_file_metadata (const char *origin_url, const char *referrer_url, FILE *fp) {
+  /* Record where the file came from in the extended attributes of FP:
+   * ORIGIN_URL is stored as user.xdg.origin.url and, when non-NULL and the
+   * first write returned 0, REFERRER_URL as user.xdg.referrer.url.
+   * Returns -1 if ORIGIN_URL or FP is NULL, else the last write's result.
+   * For more details about the user namespace see
+   * [http://freedesktop.org/wiki/CommonExtendedAttributes] and
+   * [http://0pointer.de/lennart/projects/mod_mime_xattr/].  */
+  int retval = -1;
+  if (!origin_url || !fp)
+    return retval;
+  retval = write_xattr_metadata ("user.xdg.origin.url", escnonprint_uri (origin_url), fp);
+  if ((!retval) && referrer_url)
+    {
+      retval = write_xattr_metadata ("user.xdg.referrer.url", escnonprint_uri (referrer_url), fp);
+    }
+  return retval;
+}
diff --git a/src/xattr.h b/src/xattr.h
new file mode 100644
index 0000000..375d34f
--- /dev/null
+++ b/src/xattr.h
@@ -0,0 +1,45 @@
+/* xattr.h -- POSIX Extended Attribute function mappings.
+
+   Copyright (C) 2016 Free Software Foundation, Inc.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.  */
+
+#include <stdio.h>
+
+#ifndef _XATTR_H
+#define _XATTR_H
+
+/* Store metadata name/value attributes against fp. */
+int set_file_metadata (const char *origin_url, const char *referrer_url, FILE *fp);
+
+#if defined(__linux)
+/* libc on Linux has fsetxattr (5 arguments). */
+#  include <sys/xattr.h>
+#  define USE_XATTR
+#elif defined(__APPLE__)
+/* libc on OS/X has fsetxattr (6 arguments). */
+#  include <sys/xattr.h>
+#  define fsetxattr(file, name, buffer, size, flags) \
+          fsetxattr((file), (name), (buffer), (size), 0, (flags))
+#  define USE_XATTR
+#elif defined(__FreeBSD_version) && (__FreeBSD_version > 500000)
+/* FreeBSD */
+#  include <sys/types.h>
+#  include <sys/extattr.h>
+#  define fsetxattr(file, name, buffer, size, flags) \
+          extattr_set_fd((file), EXTATTR_NAMESPACE_USER, (name), (buffer), (size))
+#  define USE_XATTR
+#endif
+
+#endif /* _XATTR_H */
-- 
2.8.0.rc3.226.g39d4020

Reply via email to