Matt <[EMAIL PROTECTED]> writes:

> Just a couple of notes about cookies, in case someone hasn't already
> pointed them out
> 
> When using a proxy server, there will be a mismatch in
> check_domain_match() and the check will fail at debug step 4

This has been fixed in 1.7.1 and later.

> the parsing of the cookie domain name will fail on many websites
> which send the domain (domainname.com) without the leading '.'.
> Sites like altavista.com will fail, but correct ones like
> google.com will pass

I've now changed the domain check code to act more like what is
specified in Netscape's "preliminary specification".


2001-12-01  Hrvoje Niksic  <[EMAIL PROTECTED]>

        * cookies.c (check_domain_match): Reimplement to match Netscape's
        "preliminary specification" for cookies.

Index: cookies.c
===================================================================
RCS file: /pack/anoncvs/wget/src/cookies.c,v
retrieving revision 1.12
diff -u -r1.12 cookies.c
--- cookies.c   2001/11/25 17:44:27     1.12
+++ cookies.c   2001/12/01 05:03:27
@@ -670,96 +670,70 @@
 }
 
 /* Check whether COOKIE_DOMAIN is an appropriate domain for HOST.
-   This check is compliant with rfc2109.  */
+   Originally I tried to make the check compliant with rfc2109, but
+   the sites deviated too often, so I had to fall back to "tail
+   matching", as defined by the original Netscape's cookie spec.  */
 
 static int
 check_domain_match (const char *cookie_domain, const char *host)
 {
-  int headlen;
-  const char *tail;
+  static char *special_toplevel_domains[] = {
+    ".com", ".edu", ".net", ".org", ".gov", ".mil", ".int"
+  };
+  int i, required_dots;
 
   DEBUGP (("cdm: 1"));
 
   /* Numeric address requires exact match.  It also requires HOST to
-     be an IP address.  I suppose we *could* resolve HOST with
-     store_hostaddress (it would hit the hash table), but rfc2109
-     doesn't require it, and it doesn't seem very useful, so we
-     don't.  */
+     be an IP address.  */
   if (numeric_address_p (cookie_domain))
-    return !strcmp (cookie_domain, host);
+    return 0 == strcmp (cookie_domain, host);
 
   DEBUGP ((" 2"));
 
-  /* The domain must contain at least one embedded dot. */
-  {
-    const char *rest = cookie_domain;
-    int len = strlen (rest);
-    if (*rest == '.')
-      ++rest, --len;           /* ignore first dot */
-    if (len <= 0)
-      return 0;
-    if (rest[len - 1] == '.')
-      --len;                   /* ignore last dot */
-
-    if (!memchr (rest, '.', len))
-      /* No dots. */
-      return 0;
-  }
-
-  DEBUGP ((" 3"));
-
   /* For the sake of efficiency, check for exact match first. */
   if (!strcasecmp (cookie_domain, host))
     return 1;
 
-  DEBUGP ((" 4"));
+  DEBUGP ((" 3"));
 
-  /* In rfc2109 terminology, HOST needs domain-match COOKIE_DOMAIN.
-     This means that COOKIE_DOMAIN needs to start with `.' and be an
-     FQDN, and that HOST must end with COOKIE_DOMAIN.  */
+  required_dots = 3;
+  for (i = 0; i < ARRAY_SIZE (special_toplevel_domains); i++)
+    if (match_tail (cookie_domain, special_toplevel_domains[i]))
+      {
+       required_dots = 2;
+       break;
+      }
+
+  /* If the domain does not start with '.', require one less dot.
+     This is so that domains like "altavista.com" (which should be
+     ".altavista.com") are accepted.  */
   if (*cookie_domain != '.')
-    return 0;
-
-  DEBUGP ((" 5"));
-
-  /* Two proceed, we need to examine two parts of HOST: its head and
-     its tail.  Head and tail are defined in terms of the length of
-     the domain, like this:
-
-       HHHHTTTTTTTTTTTTTTT  <- host
-           DDDDDDDDDDDDDDD  <- domain
-
-     That is, "head" is the part of the host before (dlen - hlen), and
-     "tail" is what follows.
+    --required_dots;
 
-     For the domain to match, two conditions need to be true:
-
-     1. Tail must equal DOMAIN.
-     2. Head must not contain an embedded dot.  */
-
-  headlen = strlen (host) - strlen (cookie_domain);
-
-  if (headlen <= 0)
-    /* DOMAIN must be a proper subset of HOST. */
+  if (count_char (cookie_domain, '.') < required_dots)
     return 0;
-  tail = host + headlen;
 
-  DEBUGP ((" 6"));
+  DEBUGP ((" 4"));
 
-  /* (1) */
-  if (strcasecmp (tail, cookie_domain))
+  if (!match_tail (host, cookie_domain))
     return 0;
 
-  DEBUGP ((" 7"));
-
-  /* Test (2) is not part of the "domain-match" itself, but is
-     recommended by rfc2109 for reasons of privacy.  */
+  DEBUGP ((" 5"));
 
-  /* (2) */
-  if (memchr (host, '.', headlen))
-    return 0;
+  /* Don't allow domain "bar.com" to match host "foobar.com".  */
+  if (*cookie_domain != '.')
+    {
+      int dlen = strlen (cookie_domain);
+      int hlen = strlen (host);
+      /* hostname.foobar.com                   */
+      /*             bar.com                   */
+      /*            ^ <-- must be '.' for host */
+      if (hlen > dlen && host[hlen - dlen - 1] != '.')
+       return 0;
+    }
 
-  DEBUGP ((" 8"));
+  DEBUGP ((" 6"));
 
   return 1;
 }
Index: utils.c
===================================================================
RCS file: /pack/anoncvs/wget/src/utils.c,v
retrieving revision 1.29
diff -u -r1.29 utils.c
--- utils.c     2001/11/29 18:48:43     1.29
+++ utils.c     2001/12/01 05:03:27
@@ -854,8 +854,8 @@
    match_backwards ("abc", "bc") -> 1
    match_backwards ("abc", "ab") -> 0
    match_backwards ("abc", "abc") -> 1 */
-static int
-match_backwards (const char *string, const char *pattern)
+int
+match_tail (const char *string, const char *pattern)
 {
   int i, j;
 
@@ -870,7 +870,7 @@
 }
 
 /* Checks whether string S matches each element of ACCEPTS.  A list
-   element are matched either with fnmatch() or match_backwards(),
+   element are matched either with fnmatch() or match_tail(),
    according to whether the element contains wildcards or not.
 
    If the BACKWARD is 0, don't do backward comparison -- just compare
@@ -891,7 +891,7 @@
        {
          if (backward)
            {
-             if (match_backwards (s, *accepts))
+             if (match_tail (s, *accepts))
                return 1;
            }
          else
Index: utils.h
===================================================================
RCS file: /pack/anoncvs/wget/src/utils.h,v
retrieving revision 1.13
diff -u -r1.13 utils.h
--- utils.h     2001/11/25 17:44:28     1.13
+++ utils.h     2001/12/01 05:03:27
@@ -69,6 +69,7 @@
 int acceptable PARAMS ((const char *));
 int accdir PARAMS ((const char *s, enum accd));
 char *suffix PARAMS ((const char *s));
+int match_tail PARAMS ((const char *, const char *));
 
 char *read_whole_line PARAMS ((FILE *));
 struct file_memory *read_file PARAMS ((const char *));

Reply via email to