hartill     98/01/07 09:29:01

  Added:       src/support logresolve.pl
  Log:
  Adding logresolve.pl  -  a Perl version of logresolve.c that spawns lots
  of low overhead processes to map IP numbers into hostnames concurrently
  instead of one by one at a snails pace (logresolve.c).
  
  Nobody said it was shite so here it is.
  
  Submitted by: Rob Hartill
  
  Revision  Changes    Path
  1.1                  apachen/src/support/logresolve.pl
  
  Index: logresolve.pl
  ===================================================================
  #!/usr/local/bin/perl
  
  # ====================================================================
  # Copyright (c) 1995-1998 The Apache Group.  All rights reserved.
  #
  # Redistribution and use in source and binary forms, with or without
  # modification, are permitted provided that the following conditions
  # are met:
  #
  # 1. Redistributions of source code must retain the above copyright
  #    notice, this list of conditions and the following disclaimer. 
  #
  # 2. Redistributions in binary form must reproduce the above copyright
  #    notice, this list of conditions and the following disclaimer in
  #    the documentation and/or other materials provided with the
  #    distribution.
  #
  # 3. All advertising materials mentioning features or use of this
  #    software must display the following acknowledgment:
  #    "This product includes software developed by the Apache Group
  #    for use in the Apache HTTP server project (http://www.apache.org/)."
  #
  # 4. The names "Apache Server" and "Apache Group" must not be used to
  #    endorse or promote products derived from this software without
  #    prior written permission. For written permission, please contact
  #    [EMAIL PROTECTED]
  #
  # 5. Redistributions of any form whatsoever must retain the following
  #    acknowledgment:
  #    "This product includes software developed by the Apache Group
  #    for use in the Apache HTTP server project (http://www.apache.org/)."
  #
  # THIS SOFTWARE IS PROVIDED BY THE APACHE GROUP ``AS IS'' AND ANY
  # EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  # PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE APACHE GROUP OR
  # ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  # NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
  # STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
  # OF THE POSSIBILITY OF SUCH DAMAGE.
  # ====================================================================
  #
  # This software consists of voluntary contributions made by many
  # individuals on behalf of the Apache Group and was originally based
  # on public domain software written at the National Center for
  # Supercomputing Applications, University of Illinois, Urbana-Champaign.
  # For more information on the Apache Group and the Apache HTTP server
  # project, please see <http://www.apache.org/>.
  
  # logresolve.pl
  #
  # v 1.0 by robh @ imdb.com
  # 
  # usage: logresolve.pl <infile >outfile
  #
  # input = Apache/NCSA/.. logfile with IP numbers at start of lines
  # output = same logfile with IP addresses resolved to hostnames where
  #  name lookups succeeded.
  #
  # this differs from the C based 'logresolve' in that this script
  # spawns a number ($CHILDREN) of subprocesses to resolve addresses
  # concurrently and sets a short timeout ($TIMEOUT) for each lookup in
  # order to keep things moving quickly.
  #
  # the parent process handles caching of IP->hostnames using a Perl hash
  # it also avoids sending the same IP to multiple child processes to be
  # resolved multiple times concurrently.
  #
  # Depending on the settings of $CHILDREN and $TIMEOUT you should see
  # significant reductions in the overall time taken to resolve your
  # logfiles. With $CHILDREN=40 and $TIMEOUT=5 I've seen 200,000 - 300,000
  # logfile lines processed per hour compared to ~45,000 per hour
  # with 'logresolve'.
  #
  # I haven't yet seen any noticable reduction in the percentage of IPs
  # that fail to get resolved. Your mileage will no doubt vary. 5s is long
  # enough to wait IMO.
  
  $|=1;
  
  use FileHandle;
  use Socket;
  
  use strict;
  no strict 'refs';
  
  use vars qw($AF_UNIX $SOCK_STREAM $PROTOCOL);
  ($AF_UNIX, $SOCK_STREAM, $PROTOCOL) = (1, 1, 0);
  
  my $CHILDREN = 40;
  my $TIMEOUT  = 5;
  
  my $filename;
  my %hash = ();
  my $parent = $$;
  
  my @children = ();
  for (my $child = 1; $child <=$CHILDREN; $child++) {
        my $f = fork(); 
        if (!$f) {
                $filename = "./.socket.$parent.$child";
                if (-e $filename) { unlink($filename) || warn "$filename .. 
$!\n";}
                &child($child);
                exit(0);
        }
        push(@children, $f);
  }
  
  &parent;
  &cleanup;
  
  ## remove all temporary files before shutting down
  sub cleanup {
         # die kiddies, die
        kill(15, @children);
        for (my $child = 1; $child <=$CHILDREN; $child++) {
                if (-e "./.socket.$parent.$child") {
                        unlink("./.socket.$parent.$child")
                                || warn ".socket.$parent.$child $!";
                }
        }
  }
        
  sub parent {
        # Trap some possible signals to trigger temp file cleanup
        $SIG{'KILL'} = $SIG{'INT'} = $SIG{'PIPE'} = \&cleanup;
  
        my %CHILDSOCK;
        my $filename;
   
         ## fork child processes. Each child will create a socket connection
         ## to this parent and use an unique temp filename to do so.
        for (my $child = 1; $child <=$CHILDREN; $child++) {
                $CHILDSOCK{$child}= FileHandle->new;
  
                if (!socket($CHILDSOCK{$child}, $AF_UNIX, $SOCK_STREAM, 
$PROTOCOL)) {
                        warn "parent socket to child failed $!";
                }
                $filename = "./.socket.$parent.$child";
                my $response;
                do {
                        $response = connect($CHILDSOCK{$child}, "$filename");
                        if ($response != 1) {
                                sleep(1);
                        }                       
                } while ($response != 1);
                $CHILDSOCK{$child}->autoflush;
        }
        ## All child processes should now be ready or at worst warming up 
  
        my (@buffer, $child, $ip, $rest, $hostname, $response);
         ## read the logfile lines from STDIN
        while(<STDIN>) {
                @buffer = ();   # empty the logfile line buffer array.
                $child = 1;             # children are numbered 1..N, start 
with #1
  
                # while we have a child to talk to and data to give it..
                do {
                        push(@buffer, $_);                                      
# buffer the line
                        ($ip, $rest) = split(/ /, $_, 2);       # separate IP 
form rest
  
                        unless ($hash{$ip}) {                           # 
resolve if unseen IP
                                $CHILDSOCK{$child}->print("$ip\n"); # pass IP 
to next child
                                $hash{$ip} = $ip;                               
# don't look it up again.
                                $child++;
                        }
                } while (($child < ($CHILDREN-1)) and ($_ = <STDIN>));
  
                 ## now poll each child for a response
                while (--$child > 0) { 
                        $response = $CHILDSOCK{$child}->getline;
                        chomp($response);
                         # child sends us back both the IP and HOSTNAME, no 
need for us
                         # to remember what child received any given IP, and no 
worries
                         # what order we talk to the children
                        ($ip, $hostname) = split(/\|/, $response, 2);
                        $hash{$ip} = $hostname;
                }
  
                 # resolve all the logfiles lines held in the log buffer array..
                for (my $line = 0; $line <=$#buffer; $line++) {
                         # get next buffered line
                        ($ip, $rest) = split(/ /, $buffer[$line], 2);
                         # separate IP from rest and replace with cached 
hostname
                        printf STDOUT ("%s %s", $hash{$ip}, $rest);
                }
        }
  }
  
  ########################################
  
  sub child {
         # arg = numeric ID - how the parent refers to me
        my $me = shift;
  
         # add trap for alarm signals.
        $SIG{'ALRM'} = sub { die "alarmed"; };
  
         # create a socket to communicate with parent
        socket(INBOUND, $AF_UNIX, $SOCK_STREAM, $PROTOCOL)
                || die "Error with Socket: !$\n";
        $filename = "./.socket.$parent.$me";
        bind(INBOUND, $filename) || die "Error Binding $filename: $!\n";
        listen(INBOUND, 5) || die "Error Listening: $!\n";
  
        my ($ip, $send_back);
        my $talk = FileHandle->new;
  
         # accept a connection from the parent process. We only ever have
         # have one connection where we exchange 1 line of info with the
         # parent.. 1 line in (IP address), 1 line out (IP + hostname).
        accept($talk, INBOUND) || die "Error Accepting: $!\n";
         # disable I/O buffering just in case
        $talk->autoflush;
         # while the parent keeps sending data, we keep responding..
        while(($ip = $talk->getline)) {
                chomp($ip);
                 # resolve the IP if time permits and send back what we found..
                $send_back = sprintf("%s|%s", $ip, &nslookup($ip));
                $talk->print($send_back."\n");
        }
  }
  
  # perform a time restricted hostname lookup.
  sub nslookup {
         # get the IP as an arg
        my $ip = shift;
        my $hostname = undef;
  
         # do the hostname lookup inside an eval. The eval will use the
         # already configured SIGnal handler and drop out of the {} block
         # regardless of whether the alarm occured or not.
        eval {
                alarm($TIMEOUT);
                $hostname = gethostbyaddr(gethostbyname($ip), AF_INET);
                alarm(0);
        };
        if ($@ =~ /alarm/) {
                 # useful for debugging perhaps..
                # print "alarming, isn't it? ($ip)";
        }
  
         # return the hostname or the IP address itself if there is no hostname
        $hostname ne "" ? $hostname : $ip;
  }
  
  
  
  
  

Reply via email to