This patch adds "restrict" and "exclude" config attributes to htsearch.
Each attribute can be overridden by the input parameter of the same name
in the search form. This removes the requirement of listing restrictions
or exclusions in the search form, allowing you to put them in the config
file instead.
Note that if you apply both this patch and the build_select_lists.0
patch from the ftp.ccsf.org archive, you should remove the following
three lines from htsearch/Display.cc, in Display::setVariables() around
lines 512-514:
if (strcmp(builds[b+1], "restrict") == 0
|| strcmp(builds[b+1], "exclude") == 0)
sepc = '|';
With this patch, the restrict and exclude input parameters are no longer
modified in place, so the hook above will no longer work. This patch
was made with the help of Gabriele Bartolini. He committed it to CVS,
and I made a patch from it to support 3.1.5 until the next stable release
is out.
Apply in your main htdig-3.1.5 source directory using the command
"patch -p0 < this-message".
--- htcommon/defaults.cc.orig Tue Feb 15 16:05:44 2000
+++ htcommon/defaults.cc Thu May 31 07:44:43 2001
@@ -57,6 +57,7 @@ ConfigDefaults defaults[] =
{"endings_word2root_db", "${common_dir}/word2root.db"},
{"excerpt_length", "300"},
{"excerpt_show_top", "false"},
+ {"exclude", ""},
{"exclude_urls", "/cgi-bin/ .cgi"},
{"external_parsers", ""},
{"extra_word_characters", ""},
@@ -120,6 +121,7 @@ ConfigDefaults defaults[] =
{"prev_page_text", "[prev]"},
{"remove_bad_urls", "true"},
{"remove_default_doc", "index.html"},
+ {"restrict", ""},
{"robotstxt_name", "htdig"},
{"script_name", ""},
{"search_algorithm", "exact:1"},
--- htdoc/attrs.html.orig Fri Feb 25 10:18:47 2000
+++ htdoc/attrs.html Thu May 31 07:44:44 2001
@@ -1741,6 +1741,51 @@
<hr>
<dl>
<dt>
+ <strong><a name="exclude">exclude</a></strong>
+ </dt>
+ <dd>
+ <dl>
+ <dt>
+ <em>type:</em>
+ </dt>
+ <dd>
+ string list
+ </dd>
+ <dt>
+ <em>used by:</em>
+ </dt>
+ <dd>
+ <a href="htsearch.html">htsearch</a>
+ </dd>
+ <dt>
+ <em>default:</em>
+ </dt>
+ <dd>
+ <em>&lt;empty&gt;</em>
+ </dd>
+ <dt>
+ <em>description:</em>
+ </dt>
+ <dd>
+ If a URL contains any of the space separated patterns,
+ it will be discarded in the searching phase. This is
+ used to exclude certain URLs from search results.
+ The list can be specified from within the configuration
+ file, and can be overridden with the "exclude" input
+ parameter in the search form.
+ </dd>
+ <dt>
+ <em>example:</em>
+ </dt>
+ <dd>
+ exclude: cgi-bin
+ </dd>
+ </dl>
+ </dd>
+ </dl>
+ <hr>
+ <dl>
+ <dt>
<strong><a name="exclude_urls">exclude_urls</a></strong>
</dt>
<dd>
@@ -5037,6 +5082,57 @@
<hr>
<dl>
<dt>
+ <strong><a name="restrict">restrict</a></strong>
+ </dt>
+ <dd>
+ <dl>
+ <dt>
+ <em>type:</em>
+ </dt>
+ <dd>
+ string list
+ </dd>
+ <dt>
+ <em>used by:</em>
+ </dt>
+ <dd>
+ <a href="htsearch.html">htsearch</a>
+ </dd>
+ <dt>
+ <em>default:</em>
+ </dt>
+ <dd>
+ <em>&lt;empty&gt;</em>
+ </dd>
+ <dt>
+ <em>description:</em>
+ </dt>
+ <dd>
+ This specifies a set of patterns that all URLs have to
+ match against in order for them to be included in the
+ search results. Any number of strings can be specified,
+ separated by spaces. If multiple patterns are given, at
+ least one of the patterns has to match the URL.
+ The list can be specified from within the configuration
+ file, and can be overridden with the "restrict" input
+ parameter in the search form. Note that the restrict
+ list does not take precedence over the
+ <a href="#exclude">exclude</a> list - if a URL matches
+ patterns in both lists it is still excluded from the
+ search results.
+ </dd>
+ <dt>
+ <em>example:</em>
+ </dt>
+ <dd>
+ restrict: http://www.vh1.com/
+ </dd>
+ </dl>
+ </dd>
+ </dl>
+ <hr>
+ <dl>
+ <dt>
<strong><a name="robotstxt_name">
robotstxt_name</a></strong>
</dt>
--- htdoc/cf_byname.html.orig Tue Feb 15 15:59:53 2000
+++ htdoc/cf_byname.html Mon Jun 4 08:33:39 2001
@@ -56,6 +56,7 @@
<img src="dot.gif" alt="*" width=9 height=9> <a target="body"
href="attrs.html#endings_word2root_db">endings_word2root_db</a><br>
<img src="dot.gif" alt="*" width=9 height=9> <a target="body"
href="attrs.html#excerpt_length">excerpt_length</a><br>
<img src="dot.gif" alt="*" width=9 height=9> <a target="body"
href="attrs.html#excerpt_show_top">excerpt_show_top</a><br>
+ <img src="dot.gif" alt="*" width=9 height=9> <a target="body"
+href="attrs.html#exclude">exclude</a><br>
<img src="dot.gif" alt="*" width=9 height=9> <a target="body"
href="attrs.html#exclude_urls">exclude_urls</a><br>
<img src="dot.gif" alt="*" width=9 height=9> <a target="body"
href="attrs.html#external_parsers">external_parsers</a><br>
<img src="dot.gif" alt="*" width=9 height=9> <a target="body"
href="attrs.html#extra_word_characters">extra_word_characters</a><br>
@@ -138,6 +139,7 @@
<b>R</b> <font face="helvetica,arial" size="2"><br>
<img src="dot.gif" alt="*" width=9 height=9> <a target="body"
href="attrs.html#remove_bad_urls">remove_bad_urls</a><br>
<img src="dot.gif" alt="*" width=9 height=9> <a target="body"
href="attrs.html#remove_default_doc">remove_default_doc</a><br>
+ <img src="dot.gif" alt="*" width=9 height=9> <a target="body"
+href="attrs.html#restrict">restrict</a><br>
<img src="dot.gif" alt="*" width=9 height=9> <a target="body"
href="attrs.html#robotstxt_name">robotstxt_name</a><br>
</font> <br>
<b>S</b> <font face="helvetica,arial" size="2"><br>
--- htdoc/cf_byprog.html.orig Tue Feb 15 16:00:19 2000
+++ htdoc/cf_byprog.html Thu May 31 07:44:44 2001
@@ -132,6 +132,7 @@
<img src="dot.gif" alt="*" width=9 height=9> <a target="body"
href="attrs.html#endings_word2root_db">endings_word2root_db</a><br>
<img src="dot.gif" alt="*" width=9 height=9> <a target="body"
href="attrs.html#excerpt_length">excerpt_length</a><br>
<img src="dot.gif" alt="*" width=9 height=9> <a target="body"
href="attrs.html#excerpt_show_top">excerpt_show_top</a><br>
+ <img src="dot.gif" alt="*" width=9 height=9> <a target="body"
+href="attrs.html#exclude">exclude</a><br>
<img src="dot.gif" alt="*" width=9 height=9> <a target="body"
href="attrs.html#extra_word_characters">extra_word_characters</a><br>
<img src="dot.gif" alt="*" width=9 height=9> <a target="body"
href="attrs.html#iso_8601">iso_8601</a><br>
<img src="dot.gif" alt="*" width=9 height=9> <a target="body"
href="attrs.html#logging">logging</a><br>
@@ -158,6 +159,7 @@
<img src="dot.gif" alt="*" width=9 height=9> <a target="body"
href="attrs.html#page_number_text">page_number_text</a><br>
<img src="dot.gif" alt="*" width=9 height=9> <a target="body"
href="attrs.html#prefix_match_character">prefix_match_character</a><br>
<img src="dot.gif" alt="*" width=9 height=9> <a target="body"
href="attrs.html#prev_page_text">prev_page_text</a><br>
+ <img src="dot.gif" alt="*" width=9 height=9> <a target="body"
+href="attrs.html#restrict">restrict</a><br>
<img src="dot.gif" alt="*" width=9 height=9> <a target="body"
href="attrs.html#script_name">script_name</a><br>
<img src="dot.gif" alt="*" width=9 height=9> <a target="body"
href="attrs.html#search_algorithm">search_algorithm</a><br>
<img src="dot.gif" alt="*" width=9 height=9> <a target="body"
href="attrs.html#search_results_footer">search_results_footer</a><br>
--- htdoc/hts_form.html.orig Thu Feb 17 16:02:22 2000
+++ htdoc/hts_form.html Thu May 31 07:44:44 2001
@@ -49,9 +49,13 @@
<b>exclude</b>
</dt>
<dd>
- This value is a pattern that all URLs of the search results
- cannot match.<br>
- The default is blank.
+ This value is a pattern that specifies which URLs are to be
+ excluded from the search results. If a URL matches one of
+ these patterns it is discarded. Multiple patterns can be
+ given, separated by a bar ("|"), or multiple definitions
+ of the exclude input parameter can be given.<br>
+ The default is specified by the <i>exclude</i>
+ attribute in the configuration file.
</dd>
<dt>
<b>format</b>
@@ -118,11 +122,18 @@
<b>restrict</b>
</dt>
<dd>
- This value is a pattern that all URLs of the search results
- will have to match. This can be used to restrict the search
- to a particular subtree or subsection of a bigger
- database.<br>
- The default is blank.
+ This value is a pattern that all URLs of the search results
+ will have to match. This can be used to restrict the search
+ to a particular subtree or subsection of a bigger database.
+ Multiple patterns can be given, separated by a bar ("|"), or
+ multiple definitions of the restrict input parameter can be
+ given. Any URL in the search results will have to match at
+ least one of these patterns.<br>
+ Note that the restrict list does not take precedence over the
+ exclude list - if a URL matches patterns in both lists it is
+ still excluded from the search results.<br>
+ The default is specified by the <i>restrict</i>
+ attribute in the configuration file.
</dd>
<dt>
<b>sort</b>
--- htsearch/htsearch.cc.orig Tue Feb 15 16:17:13 2000
+++ htsearch/htsearch.cc Thu May 31 07:44:44 2001
@@ -104,24 +104,6 @@ main(int ac, char **av)
cgi input(optind < ac ? av[optind] : none);
//
- // Compile the URL limit pattern.
- //
- if (input.exists("restrict"))
- {
- char *sep = input["restrict"];
- while ((sep = strchr(sep, '\001')) != NULL)
- *sep++ = '|';
- limit_to.Pattern(input["restrict"]);
- }
- if (input.exists("exclude"))
- {
- char *sep = input["exclude"];
- while ((sep = strchr(sep, '\001')) != NULL)
- *sep++ = '|';
- exclude_these.Pattern(input["exclude"]);
- }
-
- //
// Setup the configuration database. First we read the compiled defaults.
// Then we override those with defaults read in from the configuration
// file, and finally we override some attributes with information we
@@ -189,6 +171,35 @@ main(int ac, char **av)
config.Add(form_vars[i], input[form_vars[i]]);
}
+ //
+ // Compile the URL limit pattern.
+ //
+
+ StringList urllist;
+ String urlpat;
+
+ if (strlen(config["restrict"]))
+ {
+ // Create a temporary list from either the configuration
+ // file or the input parameter
+ urllist.Create(config["restrict"], "| \t\r\n\001");
+ urlpat = urllist.Join('|');
+ urllist.Release(); // release the temporary list of URLs
+ config.Add("restrict", urlpat); // re-create the config attribute
+ limit_to.Pattern(urlpat); // Set the new limit pattern
+ }
+
+ if (strlen(config["exclude"]))
+ {
+ // Create a temporary list from either the configuration
+ // file or the input parameter
+ urllist.Create(config["exclude"], "| \t\r\n\001");
+ urlpat = urllist.Join('|');
+ urllist.Release(); // release the temporary list of URLs
+ config.Add("exclude", urlpat); // re-create the config attribute
+ exclude_these.Pattern(urlpat);
+ }
+
// Ctype-like functions for what constitutes a word.
HtWordType::Initialize(config);
--
Gilles R. Detillieux E-mail: <[EMAIL PROTECTED]>
Spinal Cord Research Centre WWW: http://www.scrc.umanitoba.ca/~grdetil
Dept. Physiology, U. of Manitoba Phone: (204)789-3766
Winnipeg, MB R3E 3J7 (Canada) Fax: (204)789-3930
_______________________________________________
htdig-general mailing list <[EMAIL PROTECTED]>
To unsubscribe, send a message to <[EMAIL PROTECTED]> with a
subject of unsubscribe
FAQ: http://htdig.sourceforge.net/FAQ.html