aseek-devel  

[aseek-devel] Patches to a couple of indexing problems...

Matt Sullivan
Mon, 10 Sep 2001 05:29:43 -0700

Kir / All,

Attached are a couple of patches for problems that I've discovered (mostly) in
the indexer over the last few weeks.  Details are:

1. Fixed mistaken indexing of robots.txt.  Some web sites output content (such
as their home page) with status 200 on what should be 404 errors.  Since both
Content-Type and filename were checked when deciding whether to treat a
document as robots.txt, it was possible for such robots.txt responses to be
indexed as ordinary pages (since their MIME type is not text/plain).  The patch
forces status 404 if the filename is robots.txt and the content type is not
text/plain.

2. Minor fixes for when HTTPS support is not compiled in.  It was possible for
(supposedly) HTTPS files to be indexed mistakenly even with HTTPS support not
compiled in.  This would only occur when the exact same filepath was available
via both the HTTP and HTTPS protocols (in other words, the URL
"https://someserver/somepath/" would be added and mistakenly indexed over
port 80).

3. Fixed broken redirect handling.  Although RFC 2616 states that the RHS of a
"Location" header should be an absoluteURI, it is very common for
partial/relative URIs to be used in a Location header.  ASPSeek would not
follow partial URIs in redirects (both header and meta based).  Added
whitespace stripping and qualifying of partial URIs to support redirects
properly.  Redirects are now followed in exactly the same manner as HREFs in a
document (including the addition of robots.txt when Follow and FollowOutside
are on; see the note in item 5 below).

4. Fixed the Tolower call in META handling that caused the URI to be lower-cased,
thus breaking the redirect in cases where the remote system was case sensitive
in its URI handling.

5. Fixed robots.txt URL parameter addition in the HREF and redirect code to
correctly add robots.txt with referrer 0, hops count 0 and next index time
24hrs < now (robots.txt URLs were not added at all in the redirect case, which
caused problems with FollowOutside on).

6. Also (not included in attached patch) newer autoconf needs addition of
AC_EXEEXT macro in configure.in prior to AC_LANG_CPLUSPLUS etc.  Effect of
missing macro is EXE extension mistakenly set to ".C".  Patch is: 

diff -ruN aspseek-1.2.4a.orig/configure.in aspseek-1.2.4a/configure.in
--- aspseek-1.2.4a.orig/configure.in    Wed Jul  4 04:37:36 2001
+++ aspseek-1.2.4a/configure.in Mon Sep 10 20:14:10 2001
@@ -24,6 +24,7 @@
 dnl Check for programs.
 
 dnl This is C++ package! --kir.
+AC_EXEEXT
 AC_LANG_CPLUSPLUS
 dnl AC_PROG_CC
 AC_PROG_CXX


Matt.
diff -ruN aspseek-1.2.4a.orig/include/defines.h aspseek-1.2.4a/include/defines.h
--- aspseek-1.2.4a.orig/include/defines.h       Wed Jul  4 04:38:02 2001
+++ aspseek-1.2.4a/include/defines.h    Tue Sep  4 17:09:09 2001
@@ -127,6 +127,7 @@
 #define HTTP_STATUS_REDIRECT   301
 #define HTTP_STATUS_NOT_MODIFIED       304
 #define HTTP_STATUS_DELETE             400
+#define HTTP_STATUS_NOT_FOUND          404
 #define HTTP_STATUS_RETRY              503
 #define HTTP_STATUS_BAD_REQUEST        400
 #define HTTP_STATUS_UNAVAIL            503
diff -ruN aspseek-1.2.4a.orig/src/parse.cpp aspseek-1.2.4a/src/parse.cpp
--- aspseek-1.2.4a.orig/src/parse.cpp   Wed Jul  4 04:38:01 2001
+++ aspseek-1.2.4a/src/parse.cpp        Tue Sep  4 18:28:01 2001
@@ -338,6 +338,73 @@
 
 static int NET_BUF_SIZE = 4096;
 
+/* Strip trailing whitespace from src in place and return src.  NULL and
+ * empty strings come back untouched; a string made up solely of
+ * whitespace collapses to "". */
+char *str_rtrim (register char *src)
+{
+       if (src && src[0])
+       {
+               register char *end = src + strlen(src);
+
+               /* Back up over trailing whitespace.  The cast keeps
+                * isspace() well defined for bytes >= 0x80 where plain
+                * char is signed. */
+               while ((end > src) && isspace((unsigned char) end[-1]))
+               {
+                       end--;
+               }
+               *end = 0;
+       }
+       return (src);
+}
+
+/* Return a pointer to the first non-whitespace byte of src; the buffer is
+ * left unmodified and NULL is passed straight through. */
+char *str_ltrim (register char *src)
+{
+       while (src && isspace((unsigned char) *src)) /* cast: no UB on bytes >= 0x80 */
+               src++;
+       return (src);
+}
+
+/* Trim both ends of src: the result points past any leading whitespace and
+ * trailing whitespace is cut in place.  NULL in, NULL out. */
+char *str_trim (register char *src)
+{
+       return (str_rtrim(str_ltrim(src)));
+}
+
+/* Case-insensitive substring search (bytes folded with tolower()).
+ * Returns a pointer to the first position in haystack at which the whole
+ * of needle occurs; NULL if there is none, if either argument is NULL,
+ * or if needle is empty.  Every haystack position is tried as a start,
+ * so an abandoned partial match cannot hide an overlapping real one
+ * ("uurl" contains "url"), and a needle cut off by the end of haystack
+ * ("ur" vs "url") counts as not found. */
+char *strcasestr (const char *haystack, const char *needle)
+{
+        if (!haystack || !needle || !*needle)
+        {
+                return (NULL);
+        }
+        for (; *haystack; haystack++)
+        {
+                const char *h = haystack, *n = needle;
+                /* unsigned char casts keep tolower() defined for bytes >= 0x80 */
+                while (*n && tolower((unsigned char) *h) == tolower((unsigned char) *n))
+                {
+                        h++;
+                        n++;
+                }
+                if (!*n)
+                {
+                        return ((char *) haystack);
+                }
+        }
+        return (NULL);
+}
+
 char* Find2CRs(char* buf, int size)
 {
        char* f = buf;
@@ -414,8 +481,19 @@
        else if (!strcmp(m_schema, "ftp") && !CurSrv->m_proxy.size())
        {
                logger.log(CAT_ALL, L_WARN, "We don't support ftp protocol without 
proxy\n");
+               wordCache.DeleteWordsFromURL(doc.m_urlID, doc.m_siteID, CurSrv);
+               wordCache.m_database->MarkDeleted(doc.m_urlID);
+               return IND_OK;
+       }
+#ifndef USE_HTTPS
+       else if (!strcmp(m_schema, "https"))
+       {
+               logger.log(CAT_ALL, L_WARN, "The https protocol is not supported in 
+this build\n");
+               wordCache.DeleteWordsFromURL(doc.m_urlID, doc.m_siteID, CurSrv);
+               wordCache.m_database->MarkDeleted(doc.m_urlID);
                return IND_OK;
        }
+#endif
        // MaxDocsPerServer
        // We only want to get max docs, -1 for no limit
        if ((CurSrv->m_server_maxdocs != -1) && (CurSrv->m_server_cntdocs > 
CurSrv->m_server_maxdocs))
@@ -660,38 +738,95 @@
                break;
 
        case HTTP_STATUS_REDIRECT: /* We'll try to use Location: xxx instead */
+               if (!strcmp(m_filename,"robots.txt"))
+               {       // Special case: we pretend we got a 404 since robots.txt
+                       // should never redirect.
+                       status = HTTP_STATUS_NOT_FOUND;
+                       wordCache.m_database->DeleteRobotsFromHost(m_hostinfo);
+                       wordCache.m_database->LoadRobotsOfHost(m_hostinfo);
+                       ucontent.UpdateUrl(status, CurSrv->m_period);
+               }
+               else
                {
-               ULONG hrID = 0;
-               if ((doc.m_hops < CurSrv->m_maxhops) && location)
-               {
-                       int newMethod;
-                       newMethod = FilterType(location, reason, &wordCache);
-                       if (newMethod != DISALLOW)
+                       ULONG hrID = 0;
+                       if ((doc.m_hops < CurSrv->m_maxhops) && location)
                        {
+                               // According to RFC 2616 rhs is absoluteURI, however 
+we need
+                               // to be a little flexible here all the same. -matt
                                CUrl newURL;
-                               if (!newURL.ParseURL(location))
+                               char *location_trim = str_trim(location);
+                               if (!newURL.ParseURL(location_trim))
                                {
-                                       if (newURL.FindServer())
+                                       int newMethod;
+                                       char srv[STRSIZ], str[STRSIZ];
+                                       char *newschema, *host, *path;
+                                       if (newURL.m_schema[0]) newschema = 
+newURL.m_schema;
+                                       else newschema = m_schema;
+                                       if (!strcmp(newschema, "file") || 
+!strcmp(newschema, "htdb"))
                                        {
-                                               char srv[STRSIZ];
-                                               sprintf(srv, "%s://%s/", 
newURL.m_schema, newURL.m_hostinfo);
-                                               if (CurSrv->m_delete_no_server || 
CurSrv->m_outside || !strcmp(m_hostinfo, newURL.m_hostinfo))
+                                               host = NULL;
+                                               path = newURL.m_path[0] ? 
+newURL.m_path : m_path;
+                                               sprintf(str, "%s:%s%s", newschema, 
+path, newURL.m_filename);
+                                               sprintf(srv, "%s:%s/", newschema, 
+path);
+                                       }
+                                       else
+                                       {
+                                               host = newURL.m_hostinfo[0] ? 
+newURL.m_hostinfo : m_hostinfo;
+                                               path = newURL.m_path[0] ? 
+newURL.m_path : m_path;
+                                               sprintf(str, "%s://%s%s%s", newschema, 
+host, path, newURL.m_filename);
+                                               sprintf(srv, "%s://%s/", newschema, 
+host);
+                                       }
+                                       Remove2Dot(str);
+                                       if(!STRNCMP(str, "ftp://";) && 
+(strstr(str,";type=")))
+                                               *(strstr(str, ";type"))=0;
+       
+                                       newMethod = FilterType(str, reason, 
+&wordCache);
+       
+                                       if ((newMethod != DISALLOW))
+                                       {
+                                               if (host && (FindRobots(host, path, 
+newURL.m_filename) >= 0) && CurSrv->m_userobots)
                                                {
-                                                       CLocker lock(&wordCache);
-                                                       hrID = 
wordCache.GetHref(location, CurSrv, doc.m_urlID, doc.m_hops + 1, srv);
-                                                       if (hrID == doc.m_urlID)
+                                               }
+                                               else
+                                               {
+                                                       int add=1;
+                                                       /* compare hostinfo in some 
+cases */
+                                                       if 
+(((!CurSrv->m_delete_no_server) && (!CurSrv->m_outside)) && (Sites.find(srv) == 
+Sites.end()))
                                                        {
-                                                               hrID = 0;
+                                                               add = 
+!strcmp(m_schema, newschema);
+                                                               if (add && host)
+                                                               {
+                                                                       add = 
+!strcmp(m_hostinfo, host);
+                                                               }
+                                                       }
+                                                       if (add)
+                                                       {
+                                                               CLocker 
+lock(&wordCache);
+       
+                                                               hrID = 
+wordCache.GetHref(str, CurSrv, doc.m_urlID, doc.m_hops + 1, srv);
+                                                               if (hrID == 
+doc.m_urlID)
+                                                               {
+                                                                       hrID = 0;
+                                                               }
+       
+                                                               /* Add robots.txt for 
+HTTP schema */
+                                                               /* When FollowOutside 
+or DeleteNoServer no */
+                                                               if 
+((!strcmp(newschema, "http")) && CurSrv->m_userobots &&
+                                                                       
+(CurSrv->m_outside || (!CurSrv->m_delete_no_server)))
+                                                               {
+                                                                       char 
+str1[STRSIZ];
+                                                                       sprintf(str1, 
+"%s://%s/%s", newschema, host, "robots.txt");
+                                                                       
+wordCache.GetHref(str1, NULL, 0, 0, srv, 86400);
+                                                               }
                                                        }
                                                }
                                        }
                                }
                        }
+                       ucontent.UpdateUrl(status, CurSrv->m_period, hrID);
                }
-               ucontent.UpdateUrl(status, CurSrv->m_period, hrID);
                result = IND_OK;
                break;
-               }
 
        case HTTP_STATUS_NOT_MODIFIED: /* Not Modified, nothing to do */
                ucontent.UpdateUrl(status, CurSrv->m_period);
@@ -764,10 +899,34 @@
 #endif /* USE_EXT_CONV
  */
 
-       if (!STRNCASECMP(content_type, "text/plain") && 
(!strcmp(m_filename,"robots.txt")))
-       {
-               result = wordCache.ParseRobots(content, m_hostinfo);
-               if (result != IND_ERROR) 
wordCache.m_database->LoadRobotsOfHost(m_hostinfo);
+       //if (!STRNCASECMP(content_type, "text/plain") && 
+(!strcmp(m_filename,"robots.txt")))
+
+       // Hmmmh, don't look at content type!  If it's robots.txt we always (!) want 
+to
+       // drop in here...  Why?  Because it is common for sites to have default Error
+       // Document handlers that output (with status 200) their main website page.
+       // The commented out test above would result in erronious "robots.txt" pages
+       // being indexed (and subsequently appearing in search results), which, 
+really,
+       // we don't want! :)  No one should ever serve out anything other that robot 
+info
+       // under this filename and it should be safe to assume this to always be the
+       // case ...
+       if (!strcmp(m_filename,"robots.txt"))
+       {
+               // Ok, now we're here, if we didn't get content_type "text/plain" 
+assume
+               // site has a ErrorDocument handler that outputs their main page or 
+such
+               // which of course we do not want to do a ParseRobots on...
+               if (!STRNCASECMP(content_type, "text/plain"))
+               {
+                       result = wordCache.ParseRobots(content, m_hostinfo);
+                       if (result != IND_ERROR) 
+wordCache.m_database->LoadRobotsOfHost(m_hostinfo);
+               }
+               else
+               {       // We pretend we got a 404 otherwise.
+                       status = HTTP_STATUS_NOT_FOUND;
+                       wordCache.m_database->DeleteRobotsFromHost(m_hostinfo);
+                       wordCache.m_database->LoadRobotsOfHost(m_hostinfo);
+                       ucontent.UpdateUrl(status, CurSrv->m_period);
+                       return IND_OK;
+               }
        }
        else
        // plain text or something like that
@@ -1184,7 +1343,7 @@
        else
        {
                int def_port = 80;
-#ifdef USE_SSL
+#ifdef USE_HTTPS
                // Default port for https is 443
                if (!strcmp(m_schema, "https"))
                        def_port = 443;
@@ -1770,9 +1929,16 @@
                                if (!strcasecmp(tag.m_equiv, "refresh"))
                                {
                                        char *u;
-                                       Tolower((unsigned char*)tag.m_content, 
CHARSET_USASCII);
-                                       if ((u = strstr(tag.m_content, "url=")))
-                                               href = strdup(u+4);
+                                       if ((u = strcasestr(tag.m_content, "url")))
+                                       {
+                                               if ((u = str_ltrim(u + 3)))
+                                               {
+                                                       if (*u == '=')
+                                                       {
+                                                               href = strdup(u + 1);
+                                                       }
+                                               }
+                                       }
                                }
                                else if (!strcasecmp(tag.m_equiv, "keywords"))
                                {
@@ -1834,10 +2000,11 @@
                        }
                        if (href && follow && doc && CurSrv->m_gfollow)
                        {       
+                               char *href_trim = str_trim(href);
                                if (doc->m_hops >= CurSrv->m_maxhops)
                                {
                                }
-                               else if (!newURL.ParseURL(href))
+                               else if (!newURL.ParseURL(href_trim))
                                {
                                        char srv[STRSIZ];
                                        char* host;
@@ -1856,9 +2023,7 @@
                                        {
                                                host = newURL.m_hostinfo[0] ? 
newURL.m_hostinfo : curURL->m_hostinfo;
                                                path = newURL.m_path[0] ? 
newURL.m_path : curURL->m_path;
-                                               sprintf(str, "%s://%s%s%s",
-                                               newURL.m_schema[0] ? newURL.m_schema : 
curURL->m_schema,
-                                               host, path, newURL.m_filename);
+                                               sprintf(str, "%s://%s%s%s", newschema, 
+host, path, newURL.m_filename);
                                                sprintf(srv, "%s://%s/", newschema, 
host);
                                        }
                                        Remove2Dot(str);
@@ -1887,7 +2052,7 @@
                                                        if (add)
                                                        {
                                                                CLocker 
lock(ucontent->m_cache);
-                                                               
+
                                                                /* Add URL itself */
                                                                ULONG hrID = 
ucontent->m_cache->GetHref(str, CurSrv, doc->m_urlID, doc->m_hops + 1, srv);
                                                                if ((hrID != 0) && 
(hrID != doc->m_urlID))
@@ -1897,11 +2062,11 @@
                                                                
                                                                /* Add robots.txt for 
HTTP schema */
                                                                /* When FollowOutside 
or DeleteNoServer no */
-                                                               if 
((!strcmp(newURL.m_schema,"http")) && CurSrv->m_userobots &&
+                                                               if 
+((!strcmp(newschema, "http")) && CurSrv->m_userobots &&
                                                                (CurSrv->m_outside || 
(!CurSrv->m_delete_no_server)))
                                                                {
-                                                                       sprintf(str1, 
"%s://%s/%s",newURL.m_schema, newURL.m_hostinfo, "robots.txt");
-                                                                       
ucontent->m_cache->GetHref(str1, NULL, doc->m_urlID, doc->m_hops + 1, srv, 1);
+                                                                       sprintf(str1, 
+"%s://%s/%s", newschema, host, "robots.txt");
+                                                                       
+ucontent->m_cache->GetHref(str1, NULL, 0, 0, srv, 86400);
                                                                }
                                                        }
                                                }