Re: using user-agent to identify for robots.txt

2003-05-30 Thread Aaron S. Hawley
This patch seems to make the user-agent checks work correctly (they might
have been broken previously) by correcting a string comparison macro.

The patch also uses the value of the --user-agent option when enforcing
robots.txt rules.

This patch is against CVS; more on that here:
http://www.gnu.org/software/wget/

Index: init.c
===
RCS file: /pack/anoncvs/wget/src/init.c,v
retrieving revision 1.54
diff -u -u -r1.54 init.c
--- init.c  2002/08/03 20:34:57 1.54
+++ init.c  2003/05/29 17:51:50
@@ -271,6 +271,7 @@
   opt.timeout = 900;
 #endif
   opt.use_robots = 1;
+  opt.useragent = xstrdup (Wget);

   opt.remove_listing = 1;

Index: res.c
===
RCS file: /pack/anoncvs/wget/src/res.c,v
retrieving revision 1.7
diff -u -u -r1.7 res.c
--- res.c   2002/05/18 02:16:24 1.7
+++ res.c   2003/05/29 17:51:50
@@ -115,7 +115,7 @@
   *matches = 1;
   *exact_match = 0;
 }
-  else if (BOUNDED_EQUAL_NO_CASE (agent, agent + length, wget))
+  else if (BOUNDED_EQUAL_NO_CASE (agent, agent + length, opt.useragent))
 {
   *matches = 1;
   *exact_match = 1;
@@ -355,7 +355,7 @@
}
   else
{
- DEBUGP ((Ignoring unknown field at line %d, line_count));
+ DEBUGP ((Ignoring unknown field at line %d\n, line_count));
  goto next;
}

Index: wget.h
===
RCS file: /pack/anoncvs/wget/src/wget.h,v
retrieving revision 1.34
diff -u -u -r1.34 wget.h
--- wget.h  2002/05/18 02:16:25 1.34
+++ wget.h  2003/05/29 17:51:50
@@ -189,15 +189,15 @@
 /* Return non-zero if string bounded between BEG and END is equal to
STRING_LITERAL.  The comparison is case-sensitive.  */
 #define BOUNDED_EQUAL(beg, end, string_literal)\
-  ((end) - (beg) == sizeof (string_literal) - 1\
+  ((end) - (beg) == strlen (string_literal) - 1\
 !memcmp ((beg), (string_literal),\
-  sizeof (string_literal) - 1))
+  strlen (string_literal) - 1))

 /* The same as above, except the comparison is case-insensitive. */
 #define BOUNDED_EQUAL_NO_CASE(beg, end, string_literal)\
-  ((end) - (beg) == sizeof (string_literal) - 1\
+  ((end) - (beg) == strlen (string_literal)\
 !strncasecmp ((beg), (string_literal),   \
-   sizeof (string_literal) - 1))
+   strlen (string_literal)))

 /* Note that this much more elegant definition cannot be used:

On Wed, 28 May 2003, Christian von Ferber wrote:

 Hi,

 I am mirroring a friendly site that excludes robots in general but
 is supposed to allow my FriendlyMirror using wget.
 For this purpose I asked the webadmin to set up his robots.txt as follows:

 User-agent: FriendlyMirror
 Disallow:

 User-agent: *
 Disallow: /

 Starting Wget by

 wget --user-agent FriendlyMirror -m http://Friendly.Site

 Wget indeed identifies as user-agent FriendlyMirror to Friendly.Site
 but considers itself to be user-agent Wget when implementing the rules
 of robots.txt.

 I think it would be nice if Wget could be told to interpret robots.txt
 such that only my FriendlyMirror and not all other robots using wget
 will continue automatic download.

 Any ideas?

 Cheers,

 Christian

-- 
#undef MACROS

? .libs
? Makefile
? config.h
? wget
Index: init.c
===
RCS file: /pack/anoncvs/wget/src/init.c,v
retrieving revision 1.54
diff -u -u -r1.54 init.c
--- init.c  2002/08/03 20:34:57 1.54
+++ init.c  2003/05/29 17:51:50
@@ -271,6 +271,7 @@
   opt.timeout = 900;
 #endif
   opt.use_robots = 1;
+  opt.useragent = xstrdup (Wget);
 
   opt.remove_listing = 1;
 
Index: res.c
===
RCS file: /pack/anoncvs/wget/src/res.c,v
retrieving revision 1.7
diff -u -u -r1.7 res.c
--- res.c   2002/05/18 02:16:24 1.7
+++ res.c   2003/05/29 17:51:50
@@ -115,7 +115,7 @@
   *matches = 1;
   *exact_match = 0;
 }
-  else if (BOUNDED_EQUAL_NO_CASE (agent, agent + length, wget))
+  else if (BOUNDED_EQUAL_NO_CASE (agent, agent + length, opt.useragent))
 {
   *matches = 1;
   *exact_match = 1;
@@ -355,7 +355,7 @@
}
   else
{
- DEBUGP ((Ignoring unknown field at line %d, line_count));
+ DEBUGP ((Ignoring unknown field at line %d\n, line_count));
  goto next;
}
 
Index: wget.h
===
RCS file: /pack/anoncvs/wget/src/wget.h,v
retrieving revision 1.34
diff -u -u -r1.34 wget.h
--- wget.h  2002/05/18 02:16:25 1.34
+++ wget.h  2003/05/29 17:51:50
@@ -189,15 +189,15 @@
 /* Return non-zero if string bounded between BEG and END is equal to
STRING_LITERAL.  The comparison is 

using user-agent to identify for robots.txt

2003-05-28 Thread Christian von Ferber
Hi,

I am mirroring a friendly site that excludes robots in general but
is supposed to allow my FriendlyMirror using wget.
For this purpose I asked the webadmin to set up his robots.txt as follows:

User-agent: FriendlyMirror
Disallow:

User-agent: *
Disallow: /

Starting Wget by

wget --user-agent FriendlyMirror -m http://Friendly.Site

Wget indeed identifies as user-agent FriendlyMirror to Friendly.Site
but considers itself to be user-agent Wget when implementing the rules
of robots.txt.

I think it would be nice if Wget could be told to interpret robots.txt
such that only my FriendlyMirror and not all other robots using wget
will continue automatic download.

Any ideas?

Cheers,

Christian