If you want a small app that fetches only HTML files from a URL, then
try this out.  Also, please help me with testing and debugging.

Philip

-- 
"my terminal is a lethal teaspoon."
-- Patricia O Tuama
#!/usr/bin/perl -w

use strict;
use Socket qw(:DEFAULT :crlf);

# Main script: fetch the URL given as the first command-line argument with a
# plain HTTP/1.0 GET over a TCP socket and print the response body to STDOUT.
# 4xx/5xx responses and redirects are reported through dienicely() instead of
# being printed.

dienicely("Usage: $0 <URL>", 1) unless @ARGV;

my $useragent = "Search_Spider/0.1";

my ($host, $port, $remotename) = getnexthost();

# A failed name lookup does not set $!, so ask dienicely() not to append it.
my $iaddr = inet_aton($host)
        or dienicely("Hostname lookup failure: $host", 1);
my $paddr = sockaddr_in($port, $iaddr);
my $proto = getprotobyname('tcp');

socket(my $sock, PF_INET, SOCK_STREAM, $proto)
        or dienicely("Can't open socket on local machine");
connect($sock, $paddr)
        or dienicely("Can't connect socket on $host:$port");

# Turn on autoflush for the socket, then restore the previous default handle.
my $previous = select($sock);
$| = 1;
select($previous);

print $sock "GET $remotename HTTP/1.0$CRLF";
print $sock "Host: $host:$port$CRLF";
print $sock "User-Agent: $useragent$CRLF";
print $sock "$CRLF";      # End with a blank line.

# Status line.  The read returns undef if the peer closed the connection
# without sending anything, so check for that before matching.
my $status = <$sock>;
$status = "" unless defined $status;
$status =~ s/[\r\n]+$//;
dienicely("Not an HTTP host: $ARGV[0]", 1)
        unless $status =~ m#^HTTP/\d+\.\d+#;

# Remaining header lines, up to the blank separator line.  Stopping on EOF as
# well keeps a truncated response from spinning this loop forever.
my $header = "";
while (defined(my $line = <$sock>)) {
        last if $line =~ /^(?:$CRLF|$CR|$LF)$/;
        $line =~ s/[\r\n]+$//;
        $header .= "$line\n";
}

# Header field names are case-insensitive and the whitespace after the colon
# is optional.
my $location;
$location = $1 if $header =~ /^(Location:\s*.+)$/mi;

if ($status =~ m#^HTTP/\d+\.\d+\s+(\d{3})(?:\s+(.*))?$#) {
        # Copy the captures straight away: the pattern matches below have no
        # groups of their own and would leave $1 and $2 undefined.
        my ($code, $reason) = ($1, $2);
        dienicely($reason || $code, 1) if $code =~ /^[45]\d{2}/;
        dienicely($location || $reason || $code, 1) if $code =~ /^30\d/;
} else {
        dienicely("Not an HTTP host: $ARGV[0]", 1);
}

# Everything left on the socket is the body; read it in one go with the
# record separator undefined only for the duration of the read.
my $body = do { local $/; <$sock> };
print $body if defined $body;

close($sock);

exit;

# Split the URL in $ARGV[0] into the pieces needed to issue a request.
#
# Takes no arguments.  Returns the list (host, port, path):
#   host - the host name or address exactly as written in the URL
#   port - the numeric port from the URL, or 80 when none (or 0) is given
#   path - everything after the host/port up to, but not including, any
#          "#fragment"; "/" when the URL has no path
#
# Only http:// URLs are accepted.  Anything else, or a port above 65535,
# is reported through dienicely(), which does not return.
sub getnexthost {
        my $url = defined $ARGV[0] ? $ARGV[0] : "";

        # The path segments exclude '#': the fragment is for the client only
        # and must not be sent to the server as part of the request path.
        dienicely("Unrecognised URI: $url", 1)
                unless $url =~ m{^http://([-\w.]+)(?::(\d*))?((?:/[^/#]*)*)}i;

        my ($host, $port, $file) = ($1, $2, $3);

        # An absent, empty ("host:") or zero port falls back to 80.
        $port = 80 unless $port;
        dienicely("Unrecognised URI: $url", 1) if $port > 65535;

        $file = "/" unless $file;

        return ($host, $port, $file);
}

# Print "Error: <message>" to STDERR and terminate with exit status 1.
#
#   dienicely($message, $noerror)
#
# $message defaults to "Unknown error" when false.  Unless $noerror is true,
# the current value of $! is appended after a colon.
sub dienicely {
        my ($text, $skip_errno) = @_;

        $text = "Unknown error" unless $text;
        $text = "$text: $!" unless $skip_errno;

        print STDERR "Error: $text\n";
        exit(1);
}

# Print "Warning: <message>" to STDERR and carry on.
#
#   warnnicely($message, $noerror)
#
# $message defaults to "Unknown warning" when false.  Unless $noerror is
# true, the current value of $! is appended after a colon.
sub warnnicely {
        my ($text, $skip_errno) = @_;

        $text = "Unknown warning" unless $text;
        $text = "$text: $!" unless $skip_errno;

        print STDERR "Warning: $text\n";
}

Reply via email to