I've created a module that uses HTML::Parser to parse some HTML and build a tree structure. Someone suggested using HTML::TreeBuilder, but my HTML contains embedded HTML::Mason code, and HTML::TreeBuilder doesn't handle that well at all. HTML::TreeBuilder also adds <body>, <head>, and <html> tags when the document it is parsing doesn't contain them. The files I'm using this with are only fragments of HTML pages, so I don't want those tags added.

My module works well enough, but I'm getting to the point where I need multiple parse trees existing at the same time in a mod_perl environment. The way my module is now, they could get mixed up, because I can't find a way to pass a custom variable to the event handler subroutines of HTML::Parser.

I've figured that if I subclass it, I can create a new object for each parse tree instead of just returning an array reference. Here is my current code:

package SkylineEdit;

# Catch undeclared variables, symbolic references and suspicious constructs
# early instead of letting them silently corrupt a tree at run time.
use strict;
use warnings;

# Exporter is named in @ISA below, so load it explicitly rather than relying
# on some other module having pulled it in.
use Exporter ();
use HTML::Parser ();

our @ISA = ('Exporter');
our @EXPORT = ('html_to_htmltree', 'htmltree_to_html', 'get_node_content', 'set_node_content');


# File-scoped state shared by every call made in this process.  The parser
# callbacks (start/end/text) and descend_htmltree read and write these
# directly, so only one parse or one serialization can be in progress at a
# time.
my $htmltree;     # root array ref of the tree being built by html_to_htmltree
my $node;         # content array that the parser callbacks append to
my @prevnodes;    # stack of enclosing content arrays, innermost last
my $htmloutput;   # HTML text accumulated by descend_htmltree

# Elements that have no end tag in HTML.  Because no end tag follows them in
# the markup, nothing would ever close a nesting level opened for them.
my %is_void_element = map { $_ => 1 } qw(
    area base basefont br col embed frame hr img input isindex keygen
    link meta param source track wbr
);

# HTML::Parser start-tag handler (registered with argspec "tagname, attr").
#
# Creates a hash node for the element, appends it to the content array that
# $node currently points at and, unless the element is void, makes the new
# node's content array the current insertion point.
#
#   $tagname - element name as reported by the parser
#   $attr    - hash ref of attribute name => value
#
# Node layout: attributes are stored as top-level keys next to the reserved
# keys 'tag' (element name) and 'content' (array ref of children).
# NOTE(review): an attribute literally named "tag" or "content" (e.g.
# <meta content="...">) cannot be represented in this layout and is dropped.
sub start {
    my ($tagname, $attr) = @_;

    # Copy the attributes first so the reserved keys assigned below always
    # win; otherwise a "tag" attribute would replace the element name.
    my $newnode = { %{ $attr || {} } };
    $newnode->{tag}     = $tagname;
    $newnode->{content} = [];

    push @{$node}, $newnode;

    # Void elements never contain children: leave the insertion point where
    # it is so that following siblings are not nested inside e.g. a <br>.
    # The trailing-slash strip covers names such as "br/", which the parser
    # presumably reports for "<br/>" when empty_element_tags is off.
    (my $bare_name = $tagname) =~ s{/\z}{};
    return if $is_void_element{$bare_name};

    # Descend: remember the parent content array so end() can return to it.
    push @prevnodes, $node;
    $node = $newnode->{content};
}

# HTML::Parser end-tag handler (registered with argspec "tagname").
#
# Closes the innermost open element whose name matches $tagname, together
# with anything still open inside it, and moves the insertion point back to
# that element's parent.  An end tag with no matching open element is
# ignored, so a stray "</b>" can no longer close an unrelated element or pop
# past the document root.
#
# Relies on the invariant kept by start(): every array on @prevnodes had the
# element we descended into pushed onto it just before descending, so the
# last item of $prevnodes[$depth] is the element open at that depth.
# Index 0 holds the tree root pushed by html_to_htmltree and is never popped.
sub end {
    my ($tagname) = @_;

    for my $depth (reverse 1 .. $#prevnodes) {
        my $open = $prevnodes[$depth][-1];
        next unless ref($open) eq 'HASH' && $open->{tag} eq $tagname;

        # Found it: resume appending to the parent of the matched element
        # and discard every deeper level in one step.
        $node = $prevnodes[$depth];
        $#prevnodes = $depth - 1;
        return;
    }

    return;
}

# HTML::Parser text handler (registered with argspec "dtext").
#
# chomp()s the chunk and, when anything remains, appends it as a plain string
# child of the content array that $node currently points at.  Chunks that are
# empty after the chomp are skipped.
sub text {
    my ($chunk) = @_;

    chomp $chunk;
    push @{$node}, $chunk if $chunk ne '';
}

# Replace the children of one element with a single piece of content.
#
#   $htmltree - tree as returned by html_to_htmltree (array ref whose first
#               item is the root node)
#   $node     - dotted index path such as "0.2.1"; a leading "N." component
#               is stripped, and each remaining number is an index into the
#               content array of the node reached so far
#   $content  - value that becomes the element's only child
#
# Returns $htmltree.  If the path does not lead to an element node (index out
# of range, or a step lands on a text string) a warning is issued and the
# tree is returned unmodified, instead of writing into a detached hash or
# dereferencing a text string as if it were a hash.
sub set_node_content {
    my ($htmltree, $node, $content) = @_;

    my $path = defined $node ? $node : '';
    $path =~ s/^\d+\.//;

    my $target = ref($htmltree) eq 'ARRAY' ? $htmltree->[0] : undef;
    for my $index ($path =~ /(\d+)/g) {
        my $children = ref($target) eq 'HASH' ? $target->{content} : undef;
        $target = (ref($children) eq 'ARRAY' && $index <= $#{$children})
                ? $children->[$index]
                : undef;
        last unless defined $target;
    }

    if (ref($target) ne 'HASH') {
        warn "set_node_content: no element node at path '$path'";
        return $htmltree;
    }

    $target->{content} = [$content];

    return $htmltree;
}

# Serialize the children of one element back to HTML.
#
#   $htmltree - tree as returned by html_to_htmltree
#   $node     - dotted index path such as "0.2.1"; a leading "N." component
#               is stripped, and each remaining number is an index into the
#               content array of the node reached so far
#
# A third argument is accepted for compatibility with existing callers but
# is not used.
#
# Returns the HTML text of the element's children.  If the path does not lead
# to an element node a warning is issued and '' is returned.
sub get_node_content {
    my ($htmltree, $node_path) = @_;

    # descend_htmltree appends to the shared buffer, so start from empty;
    # otherwise the result would also contain the output of earlier calls.
    $htmloutput = '';

    my $path = defined $node_path ? $node_path : '';
    $path =~ s/^\d+\.//;

    my $target = ref($htmltree) eq 'ARRAY' ? $htmltree->[0] : undef;
    for my $index ($path =~ /(\d+)/g) {
        my $children = ref($target) eq 'HASH' ? $target->{content} : undef;
        $target = (ref($children) eq 'ARRAY' && $index <= $#{$children})
                ? $children->[$index]
                : undef;
        last unless defined $target;
    }

    if (ref($target) ne 'HASH') {
        warn "get_node_content: no element node at path '$path'";
        return '';
    }

    descend_htmltree($target->{content}, 0, "");

    return $htmloutput;
}

# Elements that have no end tag in HTML; no "</tag>" is written for them.
my %has_no_end_tag = map { $_ => 1 } qw(
    area base basefont br col embed frame hr img input isindex keygen
    link meta param source track wbr
);

# Recursively serialize a content array, appending the HTML to the
# file-scoped $htmloutput buffer (the caller is expected to clear it first).
#
#   $node           - array ref of children: hash refs are elements, anything
#                     else is emitted verbatim as text
#   $withclickiness - when true, <table> elements are wrapped in a bordered
#                     <div>, and the contents of <p>/<td> elements are
#                     wrapped in a bordered <div> whose onDblClick handler
#                     links to the editor for that node
#   $node_id        - dotted path of the parent; each element child gets the
#                     id "$node_id.<position in the content array>"
#
# Attributes are written in sorted order so the output is reproducible, and
# double quotes inside attribute values are written as &quot; so they cannot
# terminate the attribute early.
sub descend_htmltree {
    my ($children, $withclickiness, $node_id) = @_;

    return unless ref($children) eq 'ARRAY';

    my $colors = { td => '#ff0000', p => '#aaaaaa', table => '#ff0000' };

    my $node_counter = 0;
    foreach my $tmpnode (@{$children}) {
        if (ref($tmpnode) eq 'HASH') {
            my $tag         = $tmpnode->{tag};
            my $nodeid      = "${node_id}.${node_counter}";
            my $is_table    = $withclickiness && $tag eq 'table';
            my $is_editable = $withclickiness && ($tag eq 'p' || $tag eq 'td');

            $htmloutput .= "<div style='border: thin solid " . $colors->{$tag} . "; margin: 1px 1px 1px 1px'>" if $is_table;

            # Start tag: every key except the reserved 'tag' and 'content'
            # is an attribute.
            $htmloutput .= "<$tag";
            foreach my $name (sort keys %{$tmpnode}) {
                next if $name eq 'tag' || $name eq 'content';
                my $value = defined $tmpnode->{$name} ? $tmpnode->{$name} : '';
                $value =~ s/"/&quot;/g;
                $htmloutput .= " $name=\"$value\"";
            }
            $htmloutput .= ">";

            $htmloutput .= "<div style='padding: 1px 1px 1px 1px; border: thin solid " . $colors->{$tag} . "; margin: 1px 1px 1px 1px' onDblClick=\"parent.location = '/editor/editfile.html?action=edittext&node=${nodeid}&tmpfile='+tmpfile+'&filename='+filename\">" if $is_editable;

            descend_htmltree($tmpnode->{content}, $withclickiness, $nodeid);

            $htmloutput .= "</div>" if $is_editable;

            # The trailing-slash strip covers names such as "br/", which the
            # parser presumably reports for "<br/>".
            (my $bare_name = $tag) =~ s{/\z}{};
            $htmloutput .= "</$tag>" unless $has_no_end_tag{$bare_name};

            $htmloutput .= "</div>" if $is_table;
        }
        else {
            # Text child: emitted as stored.
            $htmloutput .= "$tmpnode";
        }
        $node_counter++;
    }

    return;
}


# Serialize a whole tree to HTML and optionally write it to a file.
#
#   $filename       - output file; '' (or any false value) means "do not
#                     write a file"
#   $withclickiness - passed through to descend_htmltree (default 0)
#   $htmltree       - tree as returned by html_to_htmltree
#
# Returns the generated HTML.  Dies if the output file cannot be opened,
# written or closed.
sub htmltree_to_html {
    my $filename = shift || '';
    my $withclickiness = shift || 0;
    my $htmltree = shift;

    # descend_htmltree appends to the shared buffer, so start from empty;
    # otherwise the result would also contain the output of earlier calls.
    $htmloutput = '';
    descend_htmltree($htmltree->[0]->{content}, $withclickiness, "0");

    if ($filename ne '') {
        # Three-argument open with a lexical handle: the file name is never
        # interpreted as a mode, and the handle cannot clash with the HTML
        # handle used elsewhere in this package.
        open my $out, '>', $filename
            or die "Can't open $filename for HTML output: $!";
        print {$out} $htmloutput
            or die "Can't write HTML output to $filename: $!";
        # Buffered write errors only surface at close time.
        close $out
            or die "Can't finish writing HTML output to $filename: $!";
    }

    return $htmloutput;
}

# Parse HTML into a tree of nested nodes.
#
#   $filename - file to read the HTML from; when undef or '' the markup is
#               taken from $html instead
#   $html     - HTML text, used only when no file name is given (default '')
#
# Returns an array ref whose single item is the root node
# { tag => 'document', content => [...] }.  A new tree is created on every
# call.  Returns undef when the markup contains a "<%word>" tag (presumably
# a Mason block such as <%perl>, which this parser setup cannot represent).
# Dies if the input file cannot be opened.
#
# The file-scoped parser state and the output buffer are reset on every call,
# including calls that end up returning undef.
sub html_to_htmltree {
    my $filename = shift;
    my $html = shift || '';

    $htmltree = [ { tag => 'document', content => [] } ];
    $node = $htmltree->[0]->{content};
    # The tree root sits at the bottom of the stack of enclosing arrays.
    @prevnodes = ($htmltree);
    $htmloutput = "";
    my $p = HTML::Parser->new( api_version => 3,
                               start_h => [\&start, "tagname, attr"],
                               end_h   => [\&end,   "tagname"],
                               text_h  => [\&text,  "dtext"] );

    if (defined $filename && $filename ne '') {
        # Three-argument open with a lexical handle; the error names the
        # file and the reason.
        open my $in, '<', $filename
            or die "Can't open input HTML file $filename: $!";
        # Slurp the whole file; $/ is restored when this block ends.
        local $/;
        $html = <$in>;
        $html = '' unless defined $html;
        close $in;
    }

    # Kept as "return undef" (not a bare return) so that list-context callers
    # see the same one-element result as before.
    return undef if($html =~ /<\%\w+?>/s);

    $p->parse($html);
    $p->eof;

    return $htmltree;
}

1;

What changes do I need to make so that I can do something like the following? Thanks for any help.

# Desired object-oriented usage.  The module above does not support this yet:
# it defines no new() and exports plain functions.
use SkylineEdit;

# Each parse tree would be its own object rather than a bare array ref.
my $htmltree = SkylineEdit->new;
$htmltree->html_to_htmltree($somefile);

--
Andrew Gaffney
Network Administrator
Skyline Aeronautics, LLC.
636-357-1548


-- To unsubscribe, e-mail: [EMAIL PROTECTED] For additional commands, e-mail: [EMAIL PROTECTED] <http://learn.perl.org/> <http://learn.perl.org/first-response>




Reply via email to