This patch seems to make the user-agent checks work correctly (they might
have been broken previously) by correcting a string comparison macro.

The patch also uses the value of the --user-agent option when enforcing
robots.txt rules.

This patch is against CVS; more on that here:
http://www.gnu.org/software/wget/

Index: init.c
===================================================================
RCS file: /pack/anoncvs/wget/src/init.c,v
retrieving revision 1.54
diff -u -u -r1.54 init.c
--- init.c      2002/08/03 20:34:57     1.54
+++ init.c      2003/05/29 17:51:50
@@ -271,6 +271,7 @@
   opt.timeout = 900;
 #endif
   opt.use_robots = 1;
+  opt.useragent = xstrdup ("Wget");

   opt.remove_listing = 1;

Index: res.c
===================================================================
RCS file: /pack/anoncvs/wget/src/res.c,v
retrieving revision 1.7
diff -u -u -r1.7 res.c
--- res.c       2002/05/18 02:16:24     1.7
+++ res.c       2003/05/29 17:51:50
@@ -115,7 +115,7 @@
       *matches = 1;
       *exact_match = 0;
     }
-  else if (BOUNDED_EQUAL_NO_CASE (agent, agent + length, "wget"))
+  else if (BOUNDED_EQUAL_NO_CASE (agent, agent + length, opt.useragent))
     {
       *matches = 1;
       *exact_match = 1;
@@ -355,7 +355,7 @@
        }
       else
        {
-         DEBUGP (("Ignoring unknown field at line %d", line_count));
+         DEBUGP (("Ignoring unknown field at line %d\n", line_count));
          goto next;
        }

Index: wget.h
===================================================================
RCS file: /pack/anoncvs/wget/src/wget.h,v
retrieving revision 1.34
diff -u -u -r1.34 wget.h
--- wget.h      2002/05/18 02:16:25     1.34
+++ wget.h      2003/05/29 17:51:50
@@ -189,15 +189,15 @@
 /* Return non-zero if string bounded between BEG and END is equal to
    STRING_LITERAL.  The comparison is case-sensitive.  */
 #define BOUNDED_EQUAL(beg, end, string_literal)        \
-  ((end) - (beg) == sizeof (string_literal) - 1        \
+  ((end) - (beg) == strlen (string_literal) - 1        \
    && !memcmp ((beg), (string_literal),                \
-              sizeof (string_literal) - 1))
+              strlen (string_literal) - 1))

 /* The same as above, except the comparison is case-insensitive. */
 #define BOUNDED_EQUAL_NO_CASE(beg, end, string_literal)        \
-  ((end) - (beg) == sizeof (string_literal) - 1                \
+  ((end) - (beg) == strlen (string_literal)            \
    && !strncasecmp ((beg), (string_literal),           \
-                   sizeof (string_literal) - 1))
+                   strlen (string_literal)))

 /* Note that this much more elegant definition cannot be used:

On Wed, 28 May 2003, Christian von Ferber wrote:

> Hi,
>
> I am mirroring a friendly site that excludes robots in general but
> is supposed to allow my "FriendlyMirror" using wget.
> For this purpose I asked the webadmin to set up his robots.txt as follows:
>
> User-agent: FriendlyMirror
> Disallow:
>
> User-agent: *
> Disallow: /
>
> Starting Wget by
>
> wget --user-agent FriendlyMirror -m http://Friendly.Site
>
> Wget indeed identifies as user-agent "FriendlyMirror" to Friendly.Site
> but considers itself to be user-agent "Wget" when implementing the rules
> of robots.txt.
>
> I think it would be nice if Wget could be told to interpret robots.txt
> such that only my FriendlyMirror and not all other robots using wget
> will continue automatic download.
>
> Any Ideas ?
>
> Cheers,
>
> Christian

-- 
#undef MACROS
? .libs
? Makefile
? config.h
? wget
Index: init.c
===================================================================
RCS file: /pack/anoncvs/wget/src/init.c,v
retrieving revision 1.54
diff -u -u -r1.54 init.c
--- init.c      2002/08/03 20:34:57     1.54
+++ init.c      2003/05/29 17:51:50
@@ -271,6 +271,7 @@
   opt.timeout = 900;
 #endif
   opt.use_robots = 1;
+  opt.useragent = xstrdup ("Wget");
 
   opt.remove_listing = 1;
 
Index: res.c
===================================================================
RCS file: /pack/anoncvs/wget/src/res.c,v
retrieving revision 1.7
diff -u -u -r1.7 res.c
--- res.c       2002/05/18 02:16:24     1.7
+++ res.c       2003/05/29 17:51:50
@@ -115,7 +115,7 @@
       *matches = 1;
       *exact_match = 0;
     }
-  else if (BOUNDED_EQUAL_NO_CASE (agent, agent + length, "wget"))
+  else if (BOUNDED_EQUAL_NO_CASE (agent, agent + length, opt.useragent))
     {
       *matches = 1;
       *exact_match = 1;
@@ -355,7 +355,7 @@
        }
       else
        {
-         DEBUGP (("Ignoring unknown field at line %d", line_count));
+         DEBUGP (("Ignoring unknown field at line %d\n", line_count));
          goto next;
        }
 
Index: wget.h
===================================================================
RCS file: /pack/anoncvs/wget/src/wget.h,v
retrieving revision 1.34
diff -u -u -r1.34 wget.h
--- wget.h      2002/05/18 02:16:25     1.34
+++ wget.h      2003/05/29 17:51:50
@@ -189,15 +189,15 @@
 /* Return non-zero if string bounded between BEG and END is equal to
    STRING_LITERAL.  The comparison is case-sensitive.  */
 #define BOUNDED_EQUAL(beg, end, string_literal)        \
-  ((end) - (beg) == sizeof (string_literal) - 1        \
+  ((end) - (beg) == strlen (string_literal) - 1        \
    && !memcmp ((beg), (string_literal),                \
-              sizeof (string_literal) - 1))
+              strlen (string_literal) - 1))
 
 /* The same as above, except the comparison is case-insensitive. */
 #define BOUNDED_EQUAL_NO_CASE(beg, end, string_literal)        \
-  ((end) - (beg) == sizeof (string_literal) - 1                \
+  ((end) - (beg) == strlen (string_literal)            \
    && !strncasecmp ((beg), (string_literal),           \
-                   sizeof (string_literal) - 1))
+                   strlen (string_literal)))
 
 /* Note that this much more elegant definition cannot be used:
 

Reply via email to