On Fri, Jun 14, 2002 at 06:03:09PM -0500, Gilles Detillieux wrote:

> I'd recommend two changes:
> 1) Grab the most recent 3.2.0b4 snapshot
> 2) The HtFile::Request() and Document::RetrieveLocal() methods both
> have some hardcoded extensions, which should probably be kept in the
> new HtFile::Ext2Mime() method.  HtFile::Request() currently falls back
> on these when it can't open mime.types.


Greetings,

Below is the patch against 3.2.0b4-20020616.  This includes the
hardcoded types, and  bad_local_extensions  to prevent  .php  etc. from
being parsed locally.  If  bad_local_extensions  is explicitly set
empty, do you think it would be good to allow *all* files to be parsed
locally (even those with no extensions)?  Of course, ones for which no
MIME type is known would have to be treated as  text/plain  but it
would be good if a site has a lot of text files with no extensions.

Also, would there be any demand to index compressed files?  If someone
has a lot of  .ps.gz  files, for example, it could be useful to include
them in the index.

Finally, I think someone has been editing the files with a tab size other
than 8...  Is there a policy on that?

Cheers,
Lachlan


*** htdig/Document.cc   Sun Jan 13 19:13:13 2002
--- htdig/Document.cc.lha       Mon Jun 24 01:06:48 2002
***************
*** 72,78 ****
      FileConnect = 0;
      NNTPConnect = 0;
      externalConnect = 0;
!       HtConfiguration* config= HtConfiguration::config();
  
      // We probably need to move assignment of max_doc_size, according
      // to a server or url configuration value. The same is valid for
--- 72,78 ----
      FileConnect = 0;
      NNTPConnect = 0;
      externalConnect = 0;
!     HtConfiguration* config= HtConfiguration::config();
  
      // We probably need to move assignment of max_doc_size, according
      // to a server or url configuration value. The same is valid for
***************
*** 549,555 ****
  Transport::DocStatus
  Document::RetrieveLocal(HtDateTime date, StringList *filenames)
  {
!       HtConfiguration* config= HtConfiguration::config();
      struct stat stat_buf;
      String *filename;
  
--- 549,555 ----
  Transport::DocStatus
  Document::RetrieveLocal(HtDateTime date, StringList *filenames)
  {
!     HtConfiguration* config= HtConfiguration::config();
      struct stat stat_buf;
      String *filename;
  
***************
*** 558,564 ****
      // Loop through list of potential filenames until the list is exhausted
      // or a suitable file is found to exist as a regular file.
      while ((filename = (String *)filenames->Get_Next()) &&
!          ((stat((char*)*filename, &stat_buf) == -1) || !S_ISREG(stat_buf.st_mode)))
          if (debug > 1)
            cout << "  tried local file " << *filename << endl;
      
--- 558,564 ----
      // Loop through list of potential filenames until the list is exhausted
      // or a suitable file is found to exist as a regular file.
      while ((filename = (String *)filenames->Get_Next()) &&
!        ((stat((char*)*filename, &stat_buf) == -1) || !S_ISREG(stat_buf.st_mode)))
          if (debug > 1)
            cout << "  tried local file " << *filename << endl;
      
***************
*** 572,593 ****
      if (modtime <= date)
        return Transport::Document_not_changed;
  
-     // Process only HTML files (this could be changed if we read
-     // the server's mime.types file).
-     // (...and handle a select few other types for now...  this should
-     //  eventually be handled by the "file://..." handler, which uses
-     //  mime.types to determine the file type.) -- FIXME!!
      char *ext = strrchr((char*)*filename, '.');
      if (ext == NULL)
        return Transport::Document_not_local;
!     if ((mystrcasecmp(ext, ".html") == 0) || (mystrcasecmp(ext, ".htm") == 0))
!         contentType = "text/html";
!     else if ((mystrcasecmp(ext, ".txt") == 0) || (mystrcasecmp(ext, ".asc") == 0))
!         contentType = "text/plain";
!     else if ((mystrcasecmp(ext, ".pdf") == 0))
!         contentType = "application/pdf";
!     else if ((mystrcasecmp(ext, ".ps") == 0) || (mystrcasecmp(ext, ".eps") == 0))
!         contentType = "application/postscript";
      else 
        return Transport::Document_not_local;
  
--- 572,585 ----
      if (modtime <= date)
        return Transport::Document_not_changed;
  
      char *ext = strrchr((char*)*filename, '.');
+     if (ext && strchr(ext,'/'))               // Ignore a dot if it's not in the
+       ext = NULL;                     // final component of the path.
      if (ext == NULL)
        return Transport::Document_not_local;
!     const String *type = HtFile::Ext2Mime (ext + 1);
!     if (type != NULL)
!       contentType = *type;
      else 
        return Transport::Document_not_local;
  
*** htnet/HtFile.h      Mon Jun 24 01:02:42 2002
--- htnet/HtFile.h.lha  Mon Jun 24 01:02:51 2002
***************
*** 64,69 ****
--- 64,73 ----
     // manages a Transport request (method inherited from Transport class)
     virtual DocStatus Request ();
  
+    // Determine Mime type of file
+    // (Does it belong here??)
+    static const String *Ext2Mime (const char *);
+ 
   ///////
      //    Interface for resource retrieving
   ///////
*** htnet/HtFile.cc     Sun Dec 23 19:13:14 2001
--- htnet/HtFile.cc.lha Mon Jun 24 00:48:34 2002
***************
*** 76,96 ****
  }
  
  
! ///////
!    //    Manages the requesting process
! ///////
! 
! HtFile::DocStatus HtFile::Request()
  {
-    HtConfiguration* config= HtConfiguration::config();
     static Dictionary *mime_map = 0;
  
     if (!mime_map)
       {
         mime_map = new Dictionary();
         ifstream in(config->Find("mime_types").get());
         if (in)
           {
             String line;
             while (in >> line)
               {
--- 76,110 ----
  }
  
  
! // Return mime type indicated by extension  ext  (which is assumed not
! // to contain the '.'), or  NULL  if  ext  is not a known mime type, or
! // is listed in  bad_local_extensions.
! const String *HtFile::Ext2Mime (const char *ext)
  {
     static Dictionary *mime_map = 0;
  
     if (!mime_map)
       {
+        HtConfiguration* config= HtConfiguration::config();
         mime_map = new Dictionary();
+        if (!mime_map)
+        return NULL;
+ 
+        if (debug > 2)
+           cout << "MIME types: " << config->Find("mime_types").get() << endl;
         ifstream in(config->Find("mime_types").get());
         if (in)
           {
+          // Set up temporary dictionary of extensions not to parse locally
+          Dictionary bad_local_exts;
+          StringList split_exts(config->Find("bad_local_extensions"), "\t .");
+          for (int i = 0; i < split_exts.Count(); i++)
+          {
+             if (debug > 3)
+               cout << "Bad local extension: " << split_exts[i] << endl;
+             bad_local_exts.Add(split_exts[i], 0);
+          }
+ 
             String line;
             while (in >> line)
               {
***************
*** 99,114 ****
                 if ((cmt = line.indexOf('#')) >= 0)
                   line = line.sub(0, cmt);
                 StringList split_line(line, "\t ");
!                // Let's cache mime type to lesser the number of 
!                // operator [] callings
                 String mime_type = split_line[0];
                 // Fill map with values.
                 for (int i = 1; i < split_line.Count(); i++)
!                  mime_map->Add(split_line[i], new String(mime_type));
               }
           }
       }
  
     // Reset the response
     _response.Reset();
     
--- 113,161 ----
                 if ((cmt = line.indexOf('#')) >= 0)
                   line = line.sub(0, cmt);
                 StringList split_line(line, "\t ");
!                // cache mime type to lessen the number of operator [] callings
                 String mime_type = split_line[0];
                 // Fill map with values.
                 for (int i = 1; i < split_line.Count(); i++)
!              {
!                const char *ext = split_line [i];
!                if (bad_local_exts.Exists(ext))
!                {
!                  if (debug > 3)
!                    cout << "Bad local extension: " << ext << endl;
!                  continue;
!                }
! 
!                if (debug > 3)
!                  cout << "MIME: " << ext << "\t-> " << mime_type << endl;
!                  mime_map->Add(ext, new String(mime_type));
!              }
               }
           }
+        else
+        {
+          if (debug > 2)
+               cout << "MIME types file not found.  Using default types.\n";
+          mime_map->Add(String("html"), new String("text/html"));
+          mime_map->Add(String("htm"),  new String("text/html"));
+          mime_map->Add(String("txt"),  new String("text/plain"));
+          mime_map->Add(String("asc"),  new String("text/plain"));
+          mime_map->Add(String("pdf"),  new String("application/pdf"));
+          mime_map->Add(String("ps"),   new String("application/postscript"));
+          mime_map->Add(String("eps"),  new String("application/postscript"));
+        }
       }
  
+    // return MIME type, or NULL if not found
+    return (String *)mime_map->Find(ext);
+ }
+ 
+ ///////
+    //    Manages the requesting process
+ ///////
+ 
+ HtFile::DocStatus HtFile::Request()
+ {
     // Reset the response
     _response.Reset();
     
***************
*** 166,191 ****
       return Transport::Document_not_changed;
  
     char *ext = strrchr(_url.path(), '.');
     if (ext == NULL)
       return Transport::Document_not_local;
  
!    if (mime_map && mime_map->Count())
!      {
!        String *mime_type = (String *)mime_map->Find(ext + 1);
!        if (mime_type)
!          _response._content_type = *mime_type;
!        else
!          return Transport::Document_not_local;
!      }
     else
!      {
!        if ((mystrcasecmp(ext, ".html") == 0) || (mystrcasecmp(ext, ".htm") == 0))
!          _response._content_type = "text/html";
!        else if (mystrcasecmp(ext, ".txt") == 0)
!          _response._content_type = "text/plain";
!        else
!          return Transport::Document_not_local;
!      }
  
     _response._modification_time = new HtDateTime(stat_buf.st_mtime);
  
--- 213,228 ----
       return Transport::Document_not_changed;
  
     char *ext = strrchr(_url.path(), '.');
+    if (ext && strchr(ext,'/'))                // Ignore a dot if it's not in the
+      ext = NULL;                      // final component of the path.
     if (ext == NULL)
       return Transport::Document_not_local;
  
!    const String *mime_type = Ext2Mime(ext + 1);
!    if (mime_type)
!      _response._content_type = *mime_type;
     else
!      return Transport::Document_not_local;
  
     _response._modification_time = new HtDateTime(stat_buf.st_mtime);
  
*** htcommon/defaults.cc        Sun Jun 23 23:55:41 2002
--- htcommon/defaults.cc.lha    Mon Jun 24 01:01:09 2002
***************
*** 145,151 ****
        documents as text while they are some binary format. \
        If the list is empty, then all extensions are acceptable, \
        provided they pass other criteria for acceptance or rejection. \
!       See also <a href=\"#valid_extensions\">valid_extensions</a>. \
  " }, \
  { "bad_querystr", "",  \
        "pattern list", "htdig", "URL", "3.1.0", "Indexing:Where", "bad_querystr: 
forum=private section=topsecret&amp;passwd=required", " \
--- 145,165 ----
        documents as text while they are some binary format. \
        If the list is empty, then all extensions are acceptable, \
        provided they pass other criteria for acceptance or rejection. \
!       See also <a href=\"#valid_extensions\">valid_extensions</a> and \
!       <a href=\"#bad_local_extensions\">bad_local_extensions</a>. \
! " }, \
! { "bad_local_extensions", ".php .shtml",  \
!       "string list", "htdig", "URL", "all", "Indexing:Where", "bad_local_extensions: 
.php .foo .bar", " \
!       This is a list of extensions on URLs which are \
!       considered active, that is, the content delivered by the web \
!       server is not simply the text of the file, but is generated \
!       on-the-fly. This list is used mainly to allow URLs on the local \
!       machine to be read using the local filesystem, rather than \
!       through HTTP.  \
!       If the list is empty, then all extensions are acceptable, \
!       provided they pass other criteria for acceptance or rejection. \
!       See also <a href=\"#valid_extensions\">valid_extensions</a> and \
!       <a href=\"#bad_extensions\">bad_extensions</a>. \
  " }, \
  { "bad_querystr", "",  \
        "pattern list", "htdig", "URL", "3.1.0", "Indexing:Where", "bad_querystr: 
forum=private section=topsecret&amp;passwd=required", " \

-- 
Lachlan Andrew  [EMAIL PROTECTED]  Phone: +613 8344-3816 Fax: +613 8344-6678
Department of Electrical and Electronic Engineering        CRICOS Provider Code
University of Melbourne, Victoria, 3010    AUSTRALIA            00116K


-------------------------------------------------------
Sponsored by:
ThinkGeek at http://www.ThinkGeek.com/
_______________________________________________
htdig-dev mailing list
[EMAIL PROTECTED]
https://lists.sourceforge.net/lists/listinfo/htdig-dev

Reply via email to