>>>>> "Marvin" == Marvin Simkin <[EMAIL PROTECTED]> writes:

Marvin> Not to criticize someone's hard work (OK, I guess I am...) but
Marvin> aren't we barking up the wrong tree here? Shouldn't it be
Marvin> possible to discover everything we need to know about HTML by
Marvin> parsing a DTD? That way as the language changes, software
Marvin> would automatically recognize new tags.  :)

I've been looking at a DTD parser, something written with
Parse::RecDescent that would spit out a "contained-by" "contains"
list, because there are very few "sequence" things in the grammar
(except that the explicit or implicit <BODY> must follow the explicit
or implicit <HEAD>).  The idea would be to point it at a w3.org DTD
and pretty print an HTML doc, or at least generate the missing tags so
that XHTML conversion was easier.

In parallel to that, I wanted to write a direct Parse::RecDescent
program that would do the HTML parsing and missing-tag generation and
pretty printing... and then do a DTD-to-P::RD parser to write the code
automatically once I understood what all the target P::RD constructs
would have to look like.  Here's what I got so far.  It parses lists,
because I thought that was the most interesting thing to model
correctly, thanks to the naturally recursive nature of lists.  Note
how mechanical the element rules would have been to translate from the
DTD once I got the patterns down.

    #!/usr/bin/perl

    use Parse::RecDescent;

    use Data::Dumper;

    $::RD_HINT = 1;
    ## $::RD_TRACE = 20;

    my $grammar = Parse::RecDescent->new( <<'END_OF_GRAMMAR' ) or die "Cannot parse 
grammar";

    ## beginning of DTD
    ## http://www.w3.org/TR/REC-html40/sgml/dtd.html

    _coreattrs: attr['id'] | attr['class'] | attr['style'] | attr['title']

    _attrs: _coreattrs
    # _attrs: _i18n
    # _attrs: _events

    _inline: _PCDATA
    # _inline: _fontstyle
    # _inline: _phrase
    # _inline: _special
    # _inline: _formctrl

    _list: UL
    _list: OL
    _list: DIR
    _list: MENU

    # _block: P
    # _block: _heading
    _block: _list
    # _block: _preformatted
    _block: DL
    # _block: DIV | CENTER | NOSCRIPT | NOFRAMES | BLOCKQUOTE | FORM | ISINDEX
    # _block: TABLE | FIELDSET | ADDRESS

    _flow: _block
    _flow: _inline

    ## each element has three parts:
    ## NAME: element["NAME"] (might be element0 or element00 if optional tags)
    ## NAME_contents: ... contents ...
    ## NAME_attribute: ... match one attribute ...

    DL: element["DL"]
    DL_contents: (DT | DD)(s)
    DL_attribute: _attrs | attr['compact']

    DT: element0["DT"]
    DT_contents: _inline(s?)
    DT_attribute: _attrs

    DD: element0["DD"]
    DD_contents: _flow(s?)
    DD_attribute: _attrs

    OL: element["OL"]
    OL_contents: LI(s)
    OL_attribute: _attrs | attr['type'] | attr['compact'] | attr['start']

    UL: element["UL"]
    UL_contents: LI(s)
    UL_attribute: _attrs | attr['type'] | attr['compact']

    DIR: element["DIR"]
    DIR_contents: LI(s)
    DIR_attribute: _attrs | attr['compact']

    MENU: element["MENU"]
    MENU_contents: LI(s)
    MENU_attribute: _attrs | attr['compact']

    LI: element0["LI"]
    LI_contents: _flow(s?)
    LI_attribute: _attrs | attr['type'] | attr['value']

    HTML: element00["HTML"] /^\Z/ { $item[1] }
    HTML_contents: HEAD BODY { [@item[1,2]] }
    HTML_attribute: { undef } # debug

    HEAD: element00["HEAD"]
    HEAD_contents: { [] } # debug
    HEAD_attribute: { undef } # debug

    BODY: element00["BODY"]
    BODY_contents: _flow(s?)
    BODY_attribute: _attrs | attr['background']

    ## end of DTD

    ## subroutines

    element: # (element-name)
      <matchrule:start>[$arg[0]]
      <matchrule:contents>[$arg[0]]
      <matchrule:end>[$arg[0]]
        { [$arg[0], $item[1], @{$item[2]}] }

    element0: # (element-name)
      <matchrule:start>[$arg[0]]
      <matchrule:contents>[$arg[0]]
      <matchrule:end>[$arg[0]](?)
        { [$arg[0], $item[1], @{$item[2]}] }

    element0e: # (element-name)
      <matchrule:start>[$arg[0]]
        { [$arg[0], $item[1]] }

    element00: # (element-name)
      <matchrule:start>[$arg[0]](?)
      <matchrule:contents>[$arg[0]]
      <matchrule:end>[$arg[0]](?)
        { [$arg[0], @{$item[1]} ? $item[1][0] : {}, @{$item[2]}] }

    start: # (element-name)
      "<" tag[$arg[0]] attributes[$arg[0]] ">"
        { $item[3]; }

    contents: # (element-name)
      <matchrule:$arg[0]_contents>

    end: # (element-name)
      "<" "/" tag[$arg[0]] ">"
        { []; }                 # not used

    tag: # (tag-name)
      /$arg[0]/i

    attributes: # (element-name)
      <matchrule:$arg[0]_attribute>(s?)
      { +{ map { @$_ } @{$item[1]} } }

    attr: # (attribute-key)
      /$arg[0]/i "=" <commit> _CDATA { [$item[1], $item[4]] }
      | /$arg[0]/i { [$item[1], $item[1]] }

    _CDATA: /(".*?"|'.*?'|[-a-zA-Z0-9.]+)/s { $1 }

    _PCDATA: <skip:""> /[^<]+/s

    END_OF_GRAMMAR

    my $result = $grammar->HTML( <<'INPUT' ) or die "Cannot parse INPUT";
    <ul>
      <li type=foo>Hello world.
      <li id="random string">
        <ol>
          <li>Hi there.
          <li>Hey ho
        </ol>
      </li>
      <li id='#123' class=fred>Another item.
    </ul>

    INPUT

    print Dumper($result);


-- 
Randal L. Schwartz - Stonehenge Consulting Services, Inc. - +1 503 777 0095
<[EMAIL PROTECTED]> <URL:http://www.stonehenge.com/merlyn/>
Perl/Unix/security consulting, Technical writing, Comedy, etc. etc.
See PerlTraining.Stonehenge.com for onsite and open-enrollment Perl training!

Reply via email to