Matt Sullivan
Mon, 10 Sep 2001 05:29:43 -0700
Kir / All, Attached are a couple of patches for problems that I've discovered (mostly) in the indexer over the last few weeks. Details are: 1. Fixed mistaken indexing of robots.txt. Some web sites output content (such as their home page) with status 200 on what should be 404 errors. Since both Content-Type and filename were checked before a document was handled as robots.txt, such a response (whose MIME type is not text/plain) failed that check and was mistakenly indexed as a regular page. The patch forces status 404 if the filename is robots.txt and the content type is not text/plain. 2. Minor fixes for when HTTPS support is not compiled in. It was possible for (supposedly) HTTPS files to be indexed mistakenly even with HTTPS support not compiled in. This would only occur when the exact same file path was available via both HTTP and HTTPS (in other words, the URL "https://someserver/somepath/" would be added and mistakenly indexed over port 80). 3. Fixed broken redirect handling. Although RFC 2616 states that the RHS of a "Location" header should be an absoluteURI, it is very common for partial/relative URIs to be used in a Location header. ASPSeek would not follow partial URIs in redirects (both header and meta based). Added whitespace stripping and qualifying of partial URIs to support redirects properly. Redirects are now followed in exactly the same manner as HREFs in a document (including addition of robots.txt when follow & followoutside, see note below). 4. Fixed the Tolower call in META handling that caused the URI to be lowercased, thus breaking the redirect in cases where the remote system is case sensitive in its URI handling. 5. Fixed robots.txt URL parameter addition in the HREF and Redirect code to correctly add robots.txt with referrer 0, hops count 0 and next index time 24hrs < now (robots.txt was not added at all in the redirect case - which caused problems with FollowOutside on). 6.
Also (not included in the attached patch): newer autoconf versions need the AC_EXEEXT macro added to configure.in before AC_LANG_CPLUSPLUS etc. Without the macro, the EXE extension is mistakenly set to ".C". Patch is: diff -ruN aspseek-1.2.4a.orig/configure.in aspseek-1.2.4a/configure.in --- aspseek-1.2.4a.orig/configure.in Wed Jul 4 04:37:36 2001 +++ aspseek-1.2.4a/configure.in Mon Sep 10 20:14:10 2001 @@ -24,6 +24,7 @@ dnl Check for programs. dnl This is C++ package! --kir. +AC_EXEEXT AC_LANG_CPLUSPLUS dnl AC_PROG_CC AC_PROG_CXX Matt.
diff -ruN aspseek-1.2.4a.orig/include/defines.h aspseek-1.2.4a/include/defines.h
--- aspseek-1.2.4a.orig/include/defines.h Wed Jul 4 04:38:02 2001
+++ aspseek-1.2.4a/include/defines.h Tue Sep 4 17:09:09 2001
@@ -127,6 +127,7 @@
#define HTTP_STATUS_REDIRECT 301
#define HTTP_STATUS_NOT_MODIFIED 304
#define HTTP_STATUS_DELETE 400
+#define HTTP_STATUS_NOT_FOUND 404
#define HTTP_STATUS_RETRY 503
#define HTTP_STATUS_BAD_REQUEST 400
#define HTTP_STATUS_UNAVAIL 503
diff -ruN aspseek-1.2.4a.orig/src/parse.cpp aspseek-1.2.4a/src/parse.cpp
--- aspseek-1.2.4a.orig/src/parse.cpp Wed Jul 4 04:38:01 2001
+++ aspseek-1.2.4a/src/parse.cpp Tue Sep 4 18:28:01 2001
@@ -338,6 +338,73 @@
static int NET_BUF_SIZE = 4096;
+char *str_rtrim (register char *src)
+{
+ if (src && src[0])
+ {
+ register char *p, *q;
+ register int len = strlen(src);
+ p = q = src;
+ q += len;
+ if (q > p)
+ {
+ q--;
+ while (isspace(*q) && (q > p))
+ {
+ q--;
+ }
+ *(q + 1) = 0;
+ }
+ }
+ return (src);
+}
+
+char *str_ltrim (register char *src)
+{
+ while (src && isspace(*src))
+ {
+ src++;
+ }
+ return (src);
+}
+
+char *str_trim (register char *src)
+{
+ src = str_ltrim(src);
+ src = str_rtrim(src);
+ return (src);
+}
+
+char *strcasestr (const char *haystack, const char *needle)
+{
+ char *res = NULL;
+ if (haystack && needle)
+ {
+ int match = 0;
+ char *orig_needle = (char *) needle;
+ while (*haystack && *needle)
+ {
+ if (tolower(*needle) == tolower(*haystack))
+ {
+ if (!match)
+ {
+ res = (char *) haystack;
+ match = 1;
+ }
+ needle++;
+ }
+ else if (match)
+ {
+ res = NULL;
+ match = 0;
+ needle = orig_needle;
+ }
+ haystack++;
+ }
+ }
+ return (res);
+}
+
char* Find2CRs(char* buf, int size)
{
char* f = buf;
@@ -414,8 +481,19 @@
else if (!strcmp(m_schema, "ftp") && !CurSrv->m_proxy.size())
{
logger.log(CAT_ALL, L_WARN, "We don't support ftp protocol without
proxy\n");
+ wordCache.DeleteWordsFromURL(doc.m_urlID, doc.m_siteID, CurSrv);
+ wordCache.m_database->MarkDeleted(doc.m_urlID);
+ return IND_OK;
+ }
+#ifndef USE_HTTPS
+ else if (!strcmp(m_schema, "https"))
+ {
+ logger.log(CAT_ALL, L_WARN, "The https protocol is not supported in
+this build\n");
+ wordCache.DeleteWordsFromURL(doc.m_urlID, doc.m_siteID, CurSrv);
+ wordCache.m_database->MarkDeleted(doc.m_urlID);
return IND_OK;
}
+#endif
// MaxDocsPerServer
// We only want to get max docs, -1 for no limit
if ((CurSrv->m_server_maxdocs != -1) && (CurSrv->m_server_cntdocs >
CurSrv->m_server_maxdocs))
@@ -660,38 +738,95 @@
break;
case HTTP_STATUS_REDIRECT: /* We'll try to use Location: xxx instead */
+ if (!strcmp(m_filename,"robots.txt"))
+ { // Special case: we pretend we got a 404 since robots.txt
+ // should never redirect.
+ status = HTTP_STATUS_NOT_FOUND;
+ wordCache.m_database->DeleteRobotsFromHost(m_hostinfo);
+ wordCache.m_database->LoadRobotsOfHost(m_hostinfo);
+ ucontent.UpdateUrl(status, CurSrv->m_period);
+ }
+ else
{
- ULONG hrID = 0;
- if ((doc.m_hops < CurSrv->m_maxhops) && location)
- {
- int newMethod;
- newMethod = FilterType(location, reason, &wordCache);
- if (newMethod != DISALLOW)
+ ULONG hrID = 0;
+ if ((doc.m_hops < CurSrv->m_maxhops) && location)
{
+ // According to RFC 2616 rhs is absoluteURI, however
+we need
+ // to be a little flexible here all the same. -matt
CUrl newURL;
- if (!newURL.ParseURL(location))
+ char *location_trim = str_trim(location);
+ if (!newURL.ParseURL(location_trim))
{
- if (newURL.FindServer())
+ int newMethod;
+ char srv[STRSIZ], str[STRSIZ];
+ char *newschema, *host, *path;
+ if (newURL.m_schema[0]) newschema =
+newURL.m_schema;
+ else newschema = m_schema;
+ if (!strcmp(newschema, "file") ||
+!strcmp(newschema, "htdb"))
{
- char srv[STRSIZ];
- sprintf(srv, "%s://%s/",
newURL.m_schema, newURL.m_hostinfo);
- if (CurSrv->m_delete_no_server ||
CurSrv->m_outside || !strcmp(m_hostinfo, newURL.m_hostinfo))
+ host = NULL;
+ path = newURL.m_path[0] ?
+newURL.m_path : m_path;
+ sprintf(str, "%s:%s%s", newschema,
+path, newURL.m_filename);
+ sprintf(srv, "%s:%s/", newschema,
+path);
+ }
+ else
+ {
+ host = newURL.m_hostinfo[0] ?
+newURL.m_hostinfo : m_hostinfo;
+ path = newURL.m_path[0] ?
+newURL.m_path : m_path;
+ sprintf(str, "%s://%s%s%s", newschema,
+host, path, newURL.m_filename);
+ sprintf(srv, "%s://%s/", newschema,
+host);
+ }
+ Remove2Dot(str);
+ if(!STRNCMP(str, "ftp://") &&
+(strstr(str,";type=")))
+ *(strstr(str, ";type"))=0;
+
+ newMethod = FilterType(str, reason,
+&wordCache);
+
+ if ((newMethod != DISALLOW))
+ {
+ if (host && (FindRobots(host, path,
+newURL.m_filename) >= 0) && CurSrv->m_userobots)
{
- CLocker lock(&wordCache);
- hrID =
wordCache.GetHref(location, CurSrv, doc.m_urlID, doc.m_hops + 1, srv);
- if (hrID == doc.m_urlID)
+ }
+ else
+ {
+ int add=1;
+ /* compare hostinfo in some
+cases */
+ if
+(((!CurSrv->m_delete_no_server) && (!CurSrv->m_outside)) && (Sites.find(srv) ==
+Sites.end()))
{
- hrID = 0;
+ add =
+!strcmp(m_schema, newschema);
+ if (add && host)
+ {
+ add =
+!strcmp(m_hostinfo, host);
+ }
+ }
+ if (add)
+ {
+ CLocker
+lock(&wordCache);
+
+ hrID =
+wordCache.GetHref(str, CurSrv, doc.m_urlID, doc.m_hops + 1, srv);
+ if (hrID ==
+doc.m_urlID)
+ {
+ hrID = 0;
+ }
+
+ /* Add robots.txt for
+HTTP schema */
+ /* When FollowOutside
+or DeleteNoServer no */
+ if
+((!strcmp(newschema, "http")) && CurSrv->m_userobots &&
+
+(CurSrv->m_outside || (!CurSrv->m_delete_no_server)))
+ {
+ char
+str1[STRSIZ];
+ sprintf(str1,
+"%s://%s/%s", newschema, host, "robots.txt");
+
+wordCache.GetHref(str1, NULL, 0, 0, srv, 86400);
+ }
}
}
}
}
}
+ ucontent.UpdateUrl(status, CurSrv->m_period, hrID);
}
- ucontent.UpdateUrl(status, CurSrv->m_period, hrID);
result = IND_OK;
break;
- }
case HTTP_STATUS_NOT_MODIFIED: /* Not Modified, nothing to do */
ucontent.UpdateUrl(status, CurSrv->m_period);
@@ -764,10 +899,34 @@
#endif /* USE_EXT_CONV
*/
- if (!STRNCASECMP(content_type, "text/plain") &&
(!strcmp(m_filename,"robots.txt")))
- {
- result = wordCache.ParseRobots(content, m_hostinfo);
- if (result != IND_ERROR)
wordCache.m_database->LoadRobotsOfHost(m_hostinfo);
+ //if (!STRNCASECMP(content_type, "text/plain") &&
+(!strcmp(m_filename,"robots.txt")))
+
+ // Hmmmh, don't look at content type! If it's robots.txt we always (!) want
+to
+ // drop in here... Why? Because it is common for sites to have default Error
+ // Document handlers that output (with status 200) their main website page.
+ // The commented out test above would result in erronious "robots.txt" pages
+ // being indexed (and subsequently appearing in search results), which,
+really,
+ // we don't want! :) No one should ever serve out anything other that robot
+info
+ // under this filename and it should be safe to assume this to always be the
+ // case ...
+ if (!strcmp(m_filename,"robots.txt"))
+ {
+ // Ok, now we're here, if we didn't get content_type "text/plain"
+assume
+ // site has a ErrorDocument handler that outputs their main page or
+such
+ // which of course we do not want to do a ParseRobots on...
+ if (!STRNCASECMP(content_type, "text/plain"))
+ {
+ result = wordCache.ParseRobots(content, m_hostinfo);
+ if (result != IND_ERROR)
+wordCache.m_database->LoadRobotsOfHost(m_hostinfo);
+ }
+ else
+ { // We pretend we got a 404 otherwise.
+ status = HTTP_STATUS_NOT_FOUND;
+ wordCache.m_database->DeleteRobotsFromHost(m_hostinfo);
+ wordCache.m_database->LoadRobotsOfHost(m_hostinfo);
+ ucontent.UpdateUrl(status, CurSrv->m_period);
+ return IND_OK;
+ }
}
else
// plain text or something like that
@@ -1184,7 +1343,7 @@
else
{
int def_port = 80;
-#ifdef USE_SSL
+#ifdef USE_HTTPS
// Default port for https is 443
if (!strcmp(m_schema, "https"))
def_port = 443;
@@ -1770,9 +1929,16 @@
if (!strcasecmp(tag.m_equiv, "refresh"))
{
char *u;
- Tolower((unsigned char*)tag.m_content,
CHARSET_USASCII);
- if ((u = strstr(tag.m_content, "url=")))
- href = strdup(u+4);
+ if ((u = strcasestr(tag.m_content, "url")))
+ {
+ if ((u = str_ltrim(u + 3)))
+ {
+ if (*u == '=')
+ {
+ href = strdup(u + 1);
+ }
+ }
+ }
}
else if (!strcasecmp(tag.m_equiv, "keywords"))
{
@@ -1834,10 +2000,11 @@
}
if (href && follow && doc && CurSrv->m_gfollow)
{
+ char *href_trim = str_trim(href);
if (doc->m_hops >= CurSrv->m_maxhops)
{
}
- else if (!newURL.ParseURL(href))
+ else if (!newURL.ParseURL(href_trim))
{
char srv[STRSIZ];
char* host;
@@ -1856,9 +2023,7 @@
{
host = newURL.m_hostinfo[0] ?
newURL.m_hostinfo : curURL->m_hostinfo;
path = newURL.m_path[0] ?
newURL.m_path : curURL->m_path;
- sprintf(str, "%s://%s%s%s",
- newURL.m_schema[0] ? newURL.m_schema :
curURL->m_schema,
- host, path, newURL.m_filename);
+ sprintf(str, "%s://%s%s%s", newschema,
+host, path, newURL.m_filename);
sprintf(srv, "%s://%s/", newschema,
host);
}
Remove2Dot(str);
@@ -1887,7 +2052,7 @@
if (add)
{
CLocker
lock(ucontent->m_cache);
-
+
/* Add URL itself */
ULONG hrID =
ucontent->m_cache->GetHref(str, CurSrv, doc->m_urlID, doc->m_hops + 1, srv);
if ((hrID != 0) &&
(hrID != doc->m_urlID))
@@ -1897,11 +2062,11 @@
/* Add robots.txt for
HTTP schema */
/* When FollowOutside
or DeleteNoServer no */
- if
((!strcmp(newURL.m_schema,"http")) && CurSrv->m_userobots &&
+ if
+((!strcmp(newschema, "http")) && CurSrv->m_userobots &&
(CurSrv->m_outside ||
(!CurSrv->m_delete_no_server)))
{
- sprintf(str1,
"%s://%s/%s",newURL.m_schema, newURL.m_hostinfo, "robots.txt");
-
ucontent->m_cache->GetHref(str1, NULL, doc->m_urlID, doc->m_hops + 1, srv, 1);
+ sprintf(str1,
+"%s://%s/%s", newschema, host, "robots.txt");
+
+ucontent->m_cache->GetHref(str1, NULL, 0, 0, srv, 86400);
}
}
}