Tkil <[EMAIL PROTECTED]> writes:

> 3. it doesn't actually replace the URLs "in place"; you can use the
>    idea from "Bernie Aua" <[EMAIL PROTECTED]> to do a massive search
>    and replace after you have the mapping from relative -> absolute
>    URL, but that could be tricky in limited cases:
> 
>       we have some pictures <a href="pictures">here</a>.
> 
>    you need a bit of intelligence to make sure you only replace the
>    appropriate strings.

Take a look at the eg/hrefsub script that comes with
HTML::Parser-3.0x.  It can do exactly this, and has none of the
problems that a final RE-substitution as suggested above has.

Regards,
Gisle


The script goes like this:
--------------------------------------------------------------------------------
#!/usr/bin/perl

# Perform transformations on link attributes in an HTML document.
# Examples:
#
#  $ hrefsub 's/foo/bar/g' index.html
#  $ hrefsub 'URI->new_abs($_, "http://foo")' index.html
#

use strict;
use HTML::Parser ();
use URI;

my %link_attr;
{
    # Set up %link_attr by stealing information from HTML::LinkExtor
    require HTML::LinkExtor;
    while (my($k,$v) = each %HTML::LinkExtor::LINK_ELEMENT) {
        if (ref($v)) {
            $v = { map {$_ => 1} @$v };
        }
        else {
            $v = { $v => 1};
        }
        $link_attr{$k} = $v;
    }
}

my $code = shift;
my $code = 'sub edit { local $_ = shift; my($attr, $tag) = @_; no strict; ' .
           $code .
           '; $_; }';
#print $code;
eval $code;
die $@ if $@;


my $p = HTML::Parser->new(api_version => 3);
$p->handler(default => sub { print @_ }, "text");
$p->handler(start => sub {
                my($tagname, $pos, $text) = @_;
                if (my $link_attr = $link_attr{$tagname}) {
                    while (4 <= @$pos) {
                        # use attribute sets from right to left
                        # to avoid invalidating the offsets
                        # when replacing the values
                        my($k_offset, $k_len, $v_offset, $v_len) =
                            splice(@$pos, -4);
                        my $attrname = lc(substr($text, $k_offset, $k_len));
                        next unless $link_attr->{$attrname};
                        next unless $v_offset; # 0 v_offset means no value
                        my $v = substr($text, $v_offset, $v_len);
                        $v =~ s/^([\'\"])(.*)\1$/$2/;
                        my $new_v = edit($v, $attrname, $tagname);
                        next if $new_v eq $v;
                        $new_v =~ s/\"/&quot;/g;  # since we quote with ""
                        substr($text, $v_offset, $v_len) = qq("$new_v");
                    }
                }
                print $text;
            },
            "tagname, tokenpos, text");

my $file = shift || usage();
$p->parse_file($file) || die "Can't open file $file: $!\n";

sub usage
{
    my $progname = $0;
    $progname =~ s,^.*/,,;
    die "Usage: $progname <perlexpr> <filename>\n";
}

Reply via email to