Well, I've done a bit of work on the default_type stuff while trying to
get htdig working with my somewhat nonstandard mail format of one file
per message. Someone could extend the mailparse.py script to handle mail
spools, but I can't think of a good way to index inside a mail spool.
The converter ignores any MIME part that is not text/plain. It
was written with Python 1.5.2.
Patches are against 3.2.0b3 (the version named in the diff paths) - there
might be a stray debugging cout here or there.
I've written some messy patches to add a default_type attribute to
htdig, with support at the htdig/Retriever.cc level and a bit of
support in htnet/HtFile.cc. I didn't know where else it was
appropriate to put this type of thing, and doing it over and over in
other places seems kludgy.
Please comment on this. (I will admit that I'm not even a
non-laughable C++ programmer - it has been too many years since C++
and I last met.)
my htdig.conf:
database_dir: /home/sprout/tmp/htdig/db
start_url: http://localhost/files.html
local_urls: http://localhost/=/home/sprout/Mail/
local_urls_only: true
# default content type for
# things we can't figure out any other way
#
# only use this option when you know what you are going to be parsing;
# otherwise you will need a converter that handles anything
# default_type: application/nnml
default_type: text/plain
external_parsers: application/nnml->text/html \
/home/sprout/src/python/mailtests/mailparse.py
diff -ur src/htdig-3.2.0b3.orig/htdig/Document.cc src/htdig-3.2.0b3/htdig/Document.cc
--- src/htdig-3.2.0b3.orig/htdig/Document.cc Thu Feb 22 18:31:30 2001
+++ src/htdig-3.2.0b3/htdig/Document.cc Mon Jun 4 23:29:40 2001
@@ -492,7 +492,9 @@
// eventually be handled by the "file://..." handler, which uses
// mime.types to determine the file type.) -- FIXME!!
char *ext = strrchr((char*)*filename, '.');
- if (ext == NULL)
+ static String default_type = config["default_type"];
+
+ if ((ext == NULL) && (char *) default_type == NULL)
return Transport::Document_not_local;
if ((mystrcasecmp(ext, ".html") == 0) || (mystrcasecmp(ext, ".htm") == 0))
contentType = "text/html";
@@ -502,6 +504,8 @@
contentType = "application/pdf";
else if ((mystrcasecmp(ext, ".ps") == 0) || (mystrcasecmp(ext, ".eps") == 0))
contentType = "application/postscript";
+ else if ((char *) default_type != NULL)
+ contentType = default_type;
else
return Transport::Document_not_local;
diff -ur src/htdig-3.2.0b3.orig/htdig/Retriever.cc src/htdig-3.2.0b3/htdig/Retriever.cc
--- src/htdig-3.2.0b3.orig/htdig/Retriever.cc Thu Feb 22 18:31:30 2001
+++ src/htdig-3.2.0b3/htdig/Retriever.cc Mon Jun 4 23:17:58 2001
@@ -889,6 +889,9 @@
p = strtok(0, " \t");
}
+ // Is there a default type
+ static String default_type = config["default_type"];
+
static String url;
url = u;
@@ -942,14 +945,16 @@
return FALSE;
}
}
+
//
- // Or NOT in the list of valid ones
+ // Or NOT in the list of valid ones AND
+ // there is no default_type defined
//
- if (ext && valids.Count() > 0 && !valids.Exists(lowerext))
+ if (ext && valids.Count() > 0 && !valids.Exists(lowerext) && ! (char *) default_type)
{
- if (debug > 2)
- cout << endl <<" Rejected: Extension is not valid!";
- return FALSE;
+ if (debug > 2)
+ cout << endl <<" Rejected: Extension is not valid!";
+ return FALSE;
}
//
@@ -968,7 +973,9 @@
//
// If any of the limits are met, we allow the URL
//
- if (limits.match(url, 1, 0) == 0) {
+
+ // This is a kludge - if there's a default type - skip this check.
+ if ((!(char *) default_type) && (limits.match(url, 1, 0) == 0)) {
if (debug > 2)
cout << endl <<" Rejected: URL not in the limits!";
return(FALSE);
@@ -1536,6 +1543,7 @@
cout << "\nurl rejected: (level 1)" << url.get() << endl;
if (debug == 1)
cout << '-';
+ cout << config["default_type"];
}
if (debug)
cout.flush();
diff -ur src/htdig-3.2.0b3.orig/htnet/HtFile.cc src/htdig-3.2.0b3/htnet/HtFile.cc
--- src/htdig-3.2.0b3.orig/htnet/HtFile.cc Thu Feb 22 18:31:34 2001
+++ src/htdig-3.2.0b3/htnet/HtFile.cc Mon Jun 4 23:00:46 2001
@@ -166,11 +166,18 @@
return Transport::Document_not_changed;
char *ext = strrchr(_url.path(), '.');
- if (ext == NULL)
- return Transport::Document_not_local;
- if (mime_map)
- {
+ static String default_type = config["default_type"];
+
+ if(ext == NULL) {
+ if ((char *) default_type == NULL) {
+ return Transport::Document_not_local;
+ } else {
+ _response._content_type = default_type;
+ }
+ }
+ else if (mime_map)
+ {
String *mime_type = (String *)mime_map->Find(ext + 1);
if (mime_type)
_response._content_type = *mime_type;
@@ -183,8 +190,10 @@
_response._content_type = "text/html";
else if (mystrcasecmp(ext, ".txt") == 0)
_response._content_type = "text/plain";
+ else if ((char *) default_type == NULL)
+ _response._content_type = default_type;
else
- return Transport::Document_not_local;
+ return Transport::Document_not_local;
}
_response._modification_time = new HtDateTime(stat_buf.st_mtime);
mailparse.py
--
Chris Green <[EMAIL PROTECTED]>
Let not the sands of time get in your lunch.