If you want a small app that fetches only HTML files from a URL, then
try this out. Also, please help me with testing and debugging.
Philip
--
"my terminal is a lethal teaspoon."
-- Patricia O Tuama
#!/usr/bin/perl -w
# Without a URL argument there is nothing to fetch: print the usage line
# (no system-error suffix) and stop.
unless (@ARGV) {
    dienicely("Usage: $0 <URL>", 1);
}
use strict;
use Socket qw(:DEFAULT :crlf);
# Main program: fetch the document named by $ARGV[0] with a single
# HTTP/1.0 GET request and print the response body on STDOUT.
# Every failure is reported through dienicely(), which exits.
my $useragent = "Search_Spider/0.1";
my ($iaddr, $paddr, $proto, $line);
my ($host, $port, $remotename) = getnexthost();

# Resolve the host name and open a TCP connection to it.
$iaddr = inet_aton($host)
    or dienicely("Hostname lookup failure: $host");
$paddr = sockaddr_in($port, $iaddr);
$proto = getprotobyname('tcp');
socket(SOCK, PF_INET, SOCK_STREAM, $proto)
    or dienicely("Can't open socket on local machine");
connect(SOCK, $paddr)
    or dienicely("Can't connect socket on $host:$port");

# Turn on autoflush for the socket so the request is sent at once.
select SOCK; $|=1; select STDOUT;

# Send the request: request line, headers, then an empty line.
print SOCK "GET $remotename HTTP/1.0$CRLF";
print SOCK "Host: $host:$port$CRLF";
print SOCK "User-Agent: $useragent$CRLF";
print SOCK "$CRLF"; # End with a blank line.

# The first line of the reply must be an HTTP status line.  The read
# yields undef when the peer closes the connection without answering.
my $status = <SOCK>;
dienicely("Not an HTTP host: $ARGV[0]", 1)
    unless defined $status && $status =~ m#^HTTP/\d+\.\d+#;
$status =~ s/[\r\n]+$//;
my $header = "$status\n";

# Collect the header lines up to the blank separator line.  The loop
# also stops at end of file, so a reply that is cut off inside the
# header cannot make it spin forever.
while (defined($line = <SOCK>)) {
    last if $line =~ /^(?:$CRLF|$CR|$LF)$/;
    $line =~ s/[\r\n]+$//;
    $header .= "$line\n";
}

# Header names are case-insensitive and the blank after the colon is
# optional.  [ \t] keeps the match from running on to the next line.
my ($msg) = $header =~ /^(Location:[ \t]*.+)$/mi;

if ($status =~ m#^HTTP/\d+\.\d+\s+(\d{3})(?:\s+(.*))?$#) {
    # Copy the captures right away: the status-class matches below are
    # successful matches too and would reset $1 and $2.
    my ($code, $reason) = ($1, $2);
    $reason = "" unless defined $reason;
    dienicely($reason || $code, 1) if $code =~ /^[45]\d{2}/;
    if ($code =~ /^30\d/) {
        # Report the redirect target, or the status itself when the
        # reply carried no Location header.
        dienicely(defined $msg ? $msg : "$code $reason", 1);
    }
} else {
    dienicely("Not an HTTP host: $ARGV[0]", 1);
}

# Everything after the header is the body: read it in one go, with the
# record separator undefined only for the duration of the read.
my $body = do { local $/; <SOCK> };
close(SOCK);
print $body if defined $body;
exit;
# Split the URL in $ARGV[0] into the parts needed to request it.
#
# Only "http://" URLs are accepted.  Returns the list
# (host, port, request-target).  The port falls back to 80 and the
# request-target to "/" when the URL leaves them out.  A URL that cannot
# be parsed is reported through dienicely().
#
# The sub has no prototype: it is called before this definition is
# seen, and a prototype there makes perl -w warn that the call came
# too early to be checked.
sub getnexthost {
    my $uri = defined $ARGV[0] ? $ARGV[0] : "";

    # Host, optional ":port", then an optional path and query string.
    # The capture stops at "#" so that a fragment is never made part of
    # the request-target.
    my ($host, $port, $file);
    if ($uri =~ m{^http://([-\w.]+)(?::(\d*))?(/[^\#]*)?}i) {
        ($host, $port, $file) = ($1, $2, $3);
    }
    dienicely("Unrecognised URI: $uri", 1) unless $host;

    # An absent, empty or zero port means the default HTTP port.
    $port = 80 unless $port;
    dienicely("Port out of range in URI: $uri", 1) if $port > 65535;

    $file = "/" unless $file;

    return($host, $port, $file);
}
# Print "Error: <message>" on STDERR and terminate with exit status 1.
#
#   first argument  - the message text; "Unknown error" when false
#   second argument - when true, the ": $!" system-error suffix is
#                     left off
sub dienicely {
    my ($text, $skip_errno) = @_;
    $text ||= "Unknown error";
    my $suffix = $skip_errno ? "" : ": $!";
    print STDERR "Error: " . $text . $suffix . "\n";
    exit(1);
}
# Print "Warning: <message>" on STDERR and carry on.
#
#   first argument  - the message text; "Unknown warning" when false
#   second argument - when true, the ": $!" system-error suffix is
#                     left off
sub warnnicely {
    my ($text, $skip_errno) = @_;
    $text = "Unknown warning" unless $text;
    $text .= ": $!" unless $skip_errno;
    print STDERR "Warning: " . $text . "\n";
}