On Wed, 20 Feb 2002, Munazza Bukhari wrote:
>
> Does htdig let you track the search query strings? I
> am looking to improve
> the content of my website so I want to figure out what
> kind of things people
> are searching for?
>
> I can write a miner for my webserver's access log file
> but I was just wondering
> if there was something out there already.
I hacked something like this for my own website. Perhaps you might find
it useful. I'll append it to this reply. Since search terms on my website
are trilingual (English, Ukrainian and Russian, the latter two using
cp1251 encoding), I set up a hash (%HEXARRAY) to filter/convert search
terms which show up in the URL as something like %20%21%23 ... and the
like. You can amend or remove the references to the hash (salt and pepper
to your taste). This might give me some incentive to make this code a
little cleaner and more portable.
One thing I noticed about htdig is that the first search is a POST
request; consequently, the initial search term(s) isn't(aren't) in the
logfile. However, subsequent searches are recorded using a GET request
leaving the terms visible in the log entry.
Max Pyziur BRAMA - Gateway Ukraine
[EMAIL PROTECTED] http://www.brama.com/
> Thanks,
> Munazza
>
############ begin script #########################
#!/usr/bin/perl
# Run-time settings, kept in package globals that the subs below read.
$ACCESSLOG   = $ARGV[0];                          # log file: first command-line argument
$SEARCHTERMS = join('', '/tmp/', $$, 'st.txt');   # scratch file name, tagged with this process id
$LOWERTHRESH = 40;                                # not referenced anywhere else in this script
$TODAY       = qx(date);                          # output of date(1), trailing newline included
# Substitution table consumed by getterms(): each key is searched for in a
# logged search request and replaced by the value that follows it.
#   - "%XX" keys are URL percent-escapes, listed once with upper-case and
#     once with lower-case hex digits.
#   - The single-character keys near the end map upper-case letters to
#     their lower-case form.
# The list is assigned to a hash, so when a key appears more than once the
# last occurrence is the one that takes effect.
# NOTE(review): the entries for bytes 0x80-0xFF look mis-encoded in this
# copy of the script; the accompanying message says they are cp1251 --
# confirm against the author's original before relying on them.
%HEXARRAY = (
"\%20", " ", "\%21", "\!", "\%22", "\*", "\%23", "\#", "\%24", "\$", "\%25", "\%",
"\%26", "\&", "\%27", "\'", "\%28", "\(",
"\%29", "\)", "\%2A", "\*", "\%2B", "\+", "\%2C", "\,", "\%2D", "\-", "\%2E", "\.",
"\%2F", "\/", "\%30", "0", "\%31", "1",
"\%32", "2", "\%33", "3", "\%34", "4", "\%35", "5", "\%36", "6", "\%37", "7", "\%38",
"8", "\%39", "9", "\%3A", "\:", "\%3B", "\;",
"\%3C", "\<", "\%3D", "\=", "\%3E", "\>", "\%3F", "\?", "\%40", "\@", "\%41", "a",
"\%42", "b", "\%43", "c", "\%44", "d",
"\%45", "e", "\%46", "f", "\%47", "g", "\%48", "h", "\%49", "i", "\%4A", "j", "\%4B",
"k", "\%4C", "l", "\%4D", "m", "\%4E", "n",
"\%4F", "o", "\%50", "p", "\%51", "q", "\%52", "r", "\%53", "s", "\%54", "t", "\%55",
"u", "\%56", "v", "\%57", "w", "\%58", "x",
"\%59", "y", "\%5A", "z", "\%5B", "\[", "\%5C", "\\", "\%5D", "\]", "\%5E", "\^",
"\%5F", "\_", "\%60", "\`", "\%61", "a",
"\%62", "b", "\%63", "c", "\%64", "d", "\%65", "e", "\%66", "f", "\%67", "g", "\%68",
"h", "\%69", "i", "\%6A", "j", "\%6B", "k",
"\%6C", "l", "\%6D", "m", "\%6E", "n", "\%6F", "o", "\%70", "p", "\%71", "q", "\%72",
"r", "\%73", "s", "\%74", "t", "\%75", "u",
"\%76", "v", "\%77", "w", "\%78", "x", "\%79", "y", "\%7A", "z", "\%7B", "{", "\%7C",
"|", "\%7D", "}", "\%7E", "~", "\%7F", " ",
"\%80", " ", "\%81", "�", "\%82", "\�", "\%83", "�", "\%84", "\�", "\%85", "\�",
"\%86", "\�", "\%87", "\�", "\%88", " ",
"\%89", "\�", "\%8A", "�", "\%8B", "\�", "\%8C", "�", "\%8D", "�", "\%8E", " ",
"\%8F", "�", "\%90", "�", "\%91", "\�",
"\%92", "\�", "\%93", "\�", "\%94", "\�", "\%95", "\�", "\%96", "\�", "\%97", "\�",
"\%98", " ", "\%99", "\�", "\%9A", "�",
"\%9B", "\�", "\%9C", "�", "\%9D", "�", "\%9E", " ", "\%9F", "�", "\%A0", "", "\%A1",
"�", "\%A2", "�", "\%A3", " ", "\%A4", "�",
"\%A5", "�", "\%A6", "\�", "\%A7", "�", "\%A8", "�", "\%A9", "�", "\%AA", "�", "\%AB",
"\�", "\%AC", "\�", "\%AD", "\�",
"\%AE", "�", "\%AF", "�", "\%B0", "\�", "\%B1", "�", "\%B2", "�", "\%B3", "�", "\%B4",
"�", "\%B5", "�", "\%B6", "�", "\%B7", "\�",
"\%B8", "�", "\%B9", "�", "\%BA", "�", "\%BB", "\�", "\%BC", "�", "\%BD", "�", "\%BE",
"�", "\%BF", "�", "\%C0", "�", "\%C1", "�",
"\%C2", "�", "\%C3", "�", "\%C4", "�", "\%C5", "�", "\%C6", "�", "\%C7", "�", "\%C8",
"�", "\%C9", "�", "\%CA", "�", "\%CB", "�",
"\%CC", "�", "\%CD", "�", "\%CE", "�", "\%CF", "�", "\%D0", "�", "\%D1", "�", "\%D2",
"�", "\%D3", "�", "\%D4", "�", "\%D5", "�",
"\%D6", "�", "\%D7", "�", "\%D8", "�", "\%D9", "�", "\%DA", "�", "\%DB", "�", "\%DC",
"�", "\%DD", "�", "\%DE", "�", "\%DF", "�",
"\%E0", "�", "\%E1", "�", "\%E2", "�", "\%E3", "�", "\%E4", "�", "\%E5", "�", "\%E6",
"�", "\%E7", "�", "\%E8", "�", "\%E9", "�",
"\%EA", "�", "\%EB", "�", "\%EC", "�", "\%ED", "�", "\%EE", "�", "\%EF", "�", "\%F0",
"�", "\%F1", "�", "\%F2", "�", "\%F3", "�",
"\%F4", "�", "\%F5", "�", "\%F6", "�", "\%F7", "�", "\%F8", "�", "\%F9", "�", "\%FA",
"�", "\%FB", "�", "\%FC", "�", "\%FD", "�",
"\%FE", "�", "\%FF", "�",
"\%20", " ", "\%21", "\!", "\%22", "\*", "\%23", "\#", "\%24", "\$", "\%25", "\%",
"\%26", "\&", "\%27", "\'", "\%28", "\(",
"\%29", "\)", "\%2a", "\*", "\%2b", "\+", "\%2c", "\,", "\%2d", "\-", "\%2e", "\.",
"\%2f", "\/", "\%30", "0", "\%31", "1",
"\%32", "2", "\%33", "3", "\%34", "4", "\%35", "5", "\%36", "6", "\%37", "7", "\%38",
"8", "\%39", "9", "\%3a", "\:", "\%3b", "\;",
"\%3c", "\<", "\%3d", "\=", "\%3e", "\>", "\%3f", "\?", "\%40", "\@", "\%41", "a",
"\%42", "b", "\%43", "c", "\%44", "d",
"\%45", "e", "\%46", "f", "\%47", "g", "\%48", "h", "\%49", "i", "\%4a", "j", "\%4b",
"k", "\%4c", "l", "\%4d", "m", "\%4e", "n",
"\%4f", "o", "\%50", "p", "\%51", "q", "\%52", "r", "\%53", "s", "\%54", "t", "\%55",
"u", "\%56", "v", "\%57", "w", "\%58", "x",
"\%59", "y", "\%5a", "z", "\%5b", "\[", "\%5c", "\\", "\%5d", "\]", "\%5e", "\^",
"\%5f", "\_", "\%60", "\`", "\%61", "a",
"\%62", "b", "\%63", "c", "\%64", "d", "\%65", "e", "\%66", "f", "\%67", "g", "\%68",
"h", "\%69", "i", "\%6a", "j", "\%6b", "k",
"\%6c", "l", "\%6d", "m", "\%6e", "n", "\%6f", "o", "\%70", "p", "\%71", "q", "\%72",
"r", "\%73", "s", "\%74", "t", "\%75", "u",
"\%76", "v", "\%77", "w", "\%78", "x", "\%79", "y", "\%7a", "z", "\%7b", "{", "\%7c",
"|", "\%7d", "}", "\%7e", "~", "\%7f", " ",
"\%80", " ", "\%81", "�", "\%82", "\�", "\%83", "�", "\%84", "\�", "\%85", "\�",
"\%86", "\�", "\%87", "\�", "\%88", " ",
"\%89", "\�", "\%8a", "�", "\%8b", "\�", "\%8c", "�", "\%8d", "�", "\%8e", " ",
"\%8f", "�", "\%90", "�", "\%91", "\'",
"\%92", "\'", "\%93", "\"", "\%94", "\"", "\%95", "\o", "\%96", "\-", "\%97", "\-",
"\%98", " ", "\%99", "\�", "\%9a", "�",
"\%9b", "\�", "\%9c", "�", "\%9d", "�", "\%9e", " ", "\%9f", "�", "\%a0", "", "\%a1",
"�", "\%a2", "�", "\%a3", " ", "\%a4", "�",
"\%a5", "�", "\%a6", "\�", "\%a7", "�", "\%a8", "�", "\%a9", "�", "\%aa", "�", "\%ab",
"\�", "\%ac", "\ ", "\%ad", "\�",
"\%ae", "�", "\%af", "�", "\%b0", "\�", "\%b1", "�", "\%b2", "�", "\%b3", "�", "\%b4",
"�", "\%b5", "�", "\%b6", "", "\%b7", "\�",
"\%b8", "�", "\%b9", "�", "\%ba", "�", "\%bb", "\�", "\%bc", "�", "\%bd", "�", "\%be",
"�", "\%bf", "�", "\%c0", "�", "\%c1", "�",
"\%c2", "�", "\%c3", "�", "\%c4", "�", "\%c5", "�", "\%c6", "�", "\%c7", "�", "\%c8",
"�", "\%c9", "�", "\%ca", "�", "\%cb", "�",
"\%cc", "�", "\%cd", "�", "\%ce", "�", "\%cf", "�", "\%d0", "�", "\%d1", "�", "\%d2",
"�", "\%d3", "�", "\%d4", "�", "\%d5", "�",
"\%d6", "�", "\%d7", "�", "\%d8", "�", "\%d9", "�", "\%da", "�", "\%db", "�", "\%dc",
"�", "\%dd", "�", "\%de", "�", "\%df", "�",
"\%e0", "�", "\%e1", "�", "\%e2", "�", "\%e3", "�", "\%e4", "�", "\%e5", "�", "\%e6",
"�", "\%e7", "�", "\%e8", "�", "\%e9", "�",
"\%ea", "�", "\%eb", "�", "\%ec", "�", "\%ed", "�", "\%ee", "�", "\%ef", "�", "\%f0",
"�", "\%f1", "�", "\%f2", "�", "\%f3", "�",
"\%f4", "�", "\%f5", "�", "\%f6", "�", "\%f7", "�", "\%f8", "�", "\%f9", "�", "\%fa",
"�", "\%fb", "�", "\%fc", "�", "\%fd", "�",
"\%fe", "�", "\%ff", "�",
"�", "�", "�", "�", "�", "�", "�", "�", "�", "�", "�", "�", "�", "�", "�", "�", "�",
"�", "�",
"�", "�", "�", "�", "�", "�", "�", "�", "�", "�", "�", "�", "�", "�", "�", "�", "�",
"�", "�",
"�", "�", "�", "�", "�", "�", "�", "�", "�", "�", "�", "�", "�", "�", "�", "�", "�",
"�", "�",
"�", "�", "�", "�", "�", "�", "�", "�", "�", "�", "�",
"A", "a", "B", "b", "C", "c", "D", "d", "E", "e", "F", "f", "G", "g", "H", "h", "I",
"i", "J",
"j", "K", "k", "L", "l", "M", "m", "N", "n", "O", "o", "P", "p", "Q", "q", "R", "r",
"S", "s",
"T", "t", "U", "u", "V", "v", "W", "w", "X", "x", "Y", "y", "Z", "z"
) ;
# Parallel key/value lists of the substitution table. keys() and values()
# return entries in the same order as long as the hash is not modified
# in between.
@HEXKEYS = keys(%HEXARRAY) ;
@HEXVALUES = values(%HEXARRAY);

# Main pipeline: collect the terms, build the sorted lists, emit the report.
getterms() ;
prepterms() ;
printdata() ;

# Remove the scratch file named by $SEARCHTERMS, if one was created.
# deltmpfiles() was defined but never invoked, so the file accumulated in
# /tmp on every run.
deltmpfiles() ;
# getterms - tally the htsearch query terms found in the access log.
#
# Reads:   global $ACCESSLOG (path of the log file; standard input is read
#          when it is undefined or empty, as the former
#          "egrep ... $ACCESSLOG |" pipe did) and the global table %HEXARRAY.
# Writes:  $SEARCHTERMREQUESTS{$term} is incremented once for every logged
#          "GET /cgi-bin/htsearch" request that yields a non-empty term.
# Dies:    when the log file cannot be opened.
#
# Differences from the earlier implementation:
#   * the log is read directly instead of through a shell pipeline, so the
#     file name is never interpreted by a shell;
#   * every table entry is applied (the old index loop stopped one short);
#   * the table is applied in two single passes -- percent-escapes first,
#     then the remaining literal keys -- so the result no longer depends on
#     hash ordering and decoded text is never decoded a second time;
#   * lines that do not parse as a log entry are skipped.
sub getterms {
    my $log;
    my $opened = 0;
    if (defined $ACCESSLOG && $ACCESSLOG ne '') {
        open($log, '<', $ACCESSLOG)
            or die "getterms: cannot open '$ACCESSLOG': $!\n";
        $opened = 1;
    }
    else {
        $log = \*STDIN;
    }

    # Split the table keys into percent-escapes and everything else.
    my (@escape_keys, @literal_keys);
    for my $key (keys %HEXARRAY) {
        next unless length $key;    # an empty key would match everywhere
        if (substr($key, 0, 1) eq '%') {
            push @escape_keys, $key;
        }
        else {
            push @literal_keys, $key;
        }
    }

    # One compiled alternation per group, longest keys first; keys are
    # quoted so they are matched literally rather than as patterns.
    my @passes;
    for my $group (\@escape_keys, \@literal_keys) {
        next unless @$group;
        my $alternation = join '|',
            map  { quotemeta($_) }
            sort { length($b) <=> length($a) } @$group;
        push @passes, qr/($alternation)/;
    }

    while (my $line = <$log>) {
        # Same filter the egrep command applied.
        next unless index($line, 'GET /cgi-bin/htsearch') >= 0;
        chomp $line;

        # host ident authuser [timestamp] "request" status bytes
        my ($host, $rfc931, $authuser, $timestamp, $request, $status, $bytes) =
            $line =~ /^(\S+) (\S+) (\S+) \[([^\]]*)\] \"([^"]*)\" (\S+) (\S+)/;
        next unless defined $request;

        # Reduce the request line to the value of the "words" parameter.
        $request =~ s/GET.+words\=//g;
        $request =~ s/\;page.+HTTP.+//g;
        $request =~ s/ HTTP.+//g;
        $request =~ s/\+/ /g;           # '+' encodes a space in a query string

        for my $pass (@passes) {
            $request =~ s/$pass/$HEXARRAY{$1}/g;
        }

        $request =~ s/\*/ /g;
        $request =~ s/^ {1,3}//;        # drop up to three leading spaces
        $request =~ s/^\t //g;
        # Requests without a "words" parameter still carry the (by now
        # lower-cased) request prefix; remove it.
        $request =~ s/get \/cgi-bin\/htsearch//g;
        $request =~ s/^\+//g;

        next if $request eq "";
        $SEARCHTERMREQUESTS{$request}++;
    }

    close($log) if $opened;
    return;
}
# prepterms - build the two orderings that printdata() reports.
#
# Reads:  %SEARCHTERMREQUESTS (term => number of requests).
# Writes: @SEARCHTERMS - the terms in ascending string order;
#         @RST         - one formatted "count term\n" line per term,
#                        in descending string order of the whole line.
#
# The lines are produced in memory. The earlier version wrote them to the
# scratch file named by $SEARCHTERMS and read them straight back, with
# neither open checked, so a failed open silently emptied the report.
# $SEARCHTERMS is no longer touched here.
sub prepterms {
    @SEARCHTERMS = sort keys %SEARCHTERMREQUESTS;

    my @formatted = map {
        sprintf("%5.0f %s\n", $SEARCHTERMREQUESTS{$_}, $_)
    } @SEARCHTERMS;

    # The count is right-aligned and space-padded, so a plain string sort
    # ranks the lines by count; reversing puts the largest counts first.
    @RST = reverse sort @formatted;

    return;
}
# sub by_mostly_numeric() { ($a <=> $b) || ($a cmp $b); }
# printdata - write the HTML report to the currently selected output handle.
#
# Reads: $TODAY (report timestamp), @RST (formatted "count term" lines, most
#        requested first), @SEARCHTERMS (terms in alphabetical order) and
#        %SEARCHTERMREQUESTS (term => count).
#
# Fixes relative to the earlier version:
#   * every line of @RST is printed (the old loop skipped the first and the
#     last entry);
#   * search terms come from request URLs, so HTML metacharacters in them
#     are escaped before being written into the page;
#   * the <b> and second <pre> elements are properly closed;
#   * the Content-Type meta tag is emitted on one line with quoted
#     attribute values, so the charset stays part of the content value.
sub printdata {
    # Escape the characters that are significant in HTML text/attributes.
    my $escape_html = sub {
        my ($text) = @_;
        $text =~ s/&/&amp;/g;
        $text =~ s/</&lt;/g;
        $text =~ s/>/&gt;/g;
        $text =~ s/"/&quot;/g;
        return $text;
    };

    print "<HTML><HEAD>\n";
    print "<TITLE>BRAMA's Search Engine Search Terms </TITLE>\n";
    print "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=windows-1251\">\n";
    print "</HEAD><BODY>\n";
    print "<H1>BRAMA' Search Engine Search Terms</H1>\n";
    print "<EM>Report Run $TODAY</EM><p>\n";

    print "<b>Most Popular Search Terms</b>\n";
    print "<pre>";
    for my $ranked_line (@RST) {
        # Each element already ends with its own newline.
        print "\t" . $escape_html->($ranked_line);
    }
    print "</pre><br><br>";

    print "<pre>";
    print "<b>Alphabetized Search Terms</b>\n";
    for my $term (@SEARCHTERMS) {
        printf "\t%5.0f %s\n", $SEARCHTERMREQUESTS{$term}, $escape_html->($term);
    }
    print "</pre>\n";
    print "</BODY></HTML>\n";
    return;
}
# deltmpfiles - delete the scratch file whose name is held in the global
# $SEARCHTERMS. Returns whatever unlink reports (the number of files removed).
sub deltmpfiles() {
    my $removed = unlink($SEARCHTERMS);
    return $removed;
}
_______________________________________________
htdig-general mailing list <[EMAIL PROTECTED]>
To unsubscribe, send a message to <[EMAIL PROTECTED]> with a
subject of unsubscribe
FAQ: http://htdig.sourceforge.net/FAQ.html