www.m-w.com.

Zeus Odin Mon, 07 Jul 2003 19:17:52 -0700

I have written an interface for m-w.com. I found some scripts on the web but
nothing really robust. Please have a look, make comments, request
functionality, make suggestions, make changes, or anything else you feel
useful. If I have enough time, I will make this fully object-oriented.


I welcome all comments.

Thanks,
Michael

--------------begin mw.pl-----------------------------
#!/usr/bin/perl
use warnings;
use strict;
use LWP::UserAgent;
use HTTP::Request::Common qw(POST);
use Getopt::Std;

# OPTIONS:
# 1. retain tabs            a
# 2. double spaced          d
# 3. help                   h
# 4. divider btween words   i
# 5. wrap text              r
# 6. thesaurus              t
# 7. ugly formatting        u
# 8. verbose or recursive   v
# 9. set width              w

local $SIG{__WARN__} = sub {
  my $msg = shift;
  die $msg if $msg =~ /Unknown option/;
};

getopts('adhi:rtuvw:', \my %opts);
if (@ARGV==0 or $opts{'h'} or
   ($opts{'w'} and $opts{'w'} !~ /^\d+$/))
{
die <<"HELP";
  usage: $0 [-dhrtuv -iC -wN] word1 [word2 [word3 [...]]]
  -a   Do not convert tabs to single spaces, does not wrap properly
  -d   Double-spaced, default is single-spaced
  -h   Prints this usage message
  -iC  Prints a divider of character C between entries
  -r   Wrap line at width by inserting \\n, use with -wN
  -t   Use the thesaurus, default is dictionary
  -u   Ugly formattting, translate <br> to space instead of \\n
       But why would you want to do this?
  -v   Verbose, print all entries, default is only the first
  -wN  Set widths of divider and text-wrap to N, default is 80\n
  Examples:
  # all definitions of harvest and posture separated by 80 _'s
  $0 -vi_ harvest posture\n
  # all thesaurus entries for hot separated by 60 +'s
  $0 -tvi+ -w60 hot\n
  # first definitions of cape and toad, double-spaced, separated by 65 \%'s
  # retain tabs, wrap text at column 65, ugly formatting
  $0 -adrui% -w65 cape toad
HELP
}

my $spacing = $opts{'d'} ? "\n\n"      : "\n"        ;
my $div_str = $opts{'i'} ? $opts{'i'}  : '-'         ;
my $book    = $opts{'t'} ? 'Thesaurus' : 'Dictionary';
my $width   = $opts{'w'} ? $opts{'w'}  : 80          ;
my $ugly    = $opts{'u'} ? ' '         : "\n"        ;
my $divider = join '', substr($div_str x $width,0,$width), $spacing;

my $ua   = new LWP::UserAgent or die "Cannot create UserAgent\n";
my $url  = 'http://www.m-w.com/cgi-bin/dictionary';
my ($html, %entity);

while ( my $word = shift ) {
  $html = get_content([ book => $book, va => $word ]);

  if ($html =~ /One entry found for .+?$word/i) {
    $html = join '', $&, $';
    print_entry();
  } elsif ($html =~ /(\d+ entries found for .+?$word)/i) {
    $html = $1;
    html2text(\$html);
    print "$html$spacing";

    $html = join '', $&, $';
    my($list) = $html =~
      /<input type=hidden name=list value="(.+?)">/;
    my @entries = split /=|;/, $list;
    splice @entries, 0, 2; # this entry already displayed
    print_entry();

    while ($opts{'v'} and @entries) {
      $html = get_content([ book     => $book,
                            hdwd     => $word,
                            listword => $word,
                            jump     => $entries[0],
                            list     => $list
                          ]);
      print_entry();
      splice @entries, 0, 2;
    }
  } else {
    print "$word not found in the ", lc $book, " at http://www.m-w.com\n";;
  }
}


sub get_content {
  die "trying to POST an empty request" unless @_;
  my $request  = POST $url, shift;
  my $response = $ua->request($request);
  $response->content;
}


sub print_entry {
  ($html) = $html =~ m!((?:Main Entry|Entry Word):.*?)</form>!is;
  { no warnings 'uninitialized';
    my $x;
    # neat trick that turns
    # text 1 a : definition one b : definition two
    # into
    # text
    # 1a: definition one
    # 1b: definition two
    $html =~
      s!<b>(\d+)?
           (?:\s+)?
           ([a-z])?
        </b>
       !join '', "\n", ($1 and $x=$1 or $x), $2
       !xeg;
  }

  $html =~ s/<br>/$ugly/g unless $opts{'u'};
  $html =~ s/\t/ /g       unless $opts{'a'};
  html2text(\$html);
  # expecting \n, but server sent chr(13), that's what you get for
expectations
  $html = join $spacing, split /[\n\r\x{0A}\x{0D}]+/, $html;
  $html =~ s/^\s+|\s+$//g;
  # substitute only if the present line is greater than width
  my $wrap = $width + 1;
  $html =~ s/(?=.{$wrap})(.{1,$width}) +(?=[^\n])/$1$spacing/g if
$opts{'r'};

  print "$html$spacing";
  print $divider if $opts{'i'};
}


sub html2text {
  # typeglob aliases are supposedly faster than refs to scalars
  our   $htm;
  local *htm = shift;
  return unless length $htm;
  my($begin, $end) = ('<!--', '-->');

  # The three following substitutions and the %entity initialization below
  # were brazenly copied from Tom Christiansens's striphtml (striff tummel)
  # perl script written back in 1996. It can still be found at
  # http://www.perl.com/CPAN-local/authors/Tom_Christiansen/scripts/. I
  # slightly changed the first substitution: it now handles embedded
comments.
  # Hopefully, tags within comments are still handled properly. Otherwise,
  # all else is the same.

  # 1. remove embedded comments
  1 while $htm =~ s/$begin (?:(?!$begin).)*? $end//gxs;
  # 2. remove tags
  $htm =~ s/<(?:[^>'"]+ | ".*?" | '.*?')*?>//gxs;
  load_entity() unless %entity;
  { no warnings 'uninitialized';
    # 3. replace entities with actual characters
    $htm =~ s/(&(\w+ | \x23\d+);?)/$entity{$2} || $1/gxe;
  }
}


sub load_entity {
  # &lt;
  %entity = (
    lt     => '<',     # less-than
    gt     => '>',     # greater-than
    amp    => '&',     # ampersand
    quot   => '"',     # verticle double-quote
    nbsp   => chr 160, # no-break space
    iexcl  => chr 161, # inverted exclamation mark
    cent   => chr 162, # cent sign
    pound  => chr 163, # pound sterling sign CURRENCY NOT WEIGHT
    curren => chr 164, # general currency sign
    yen    => chr 165, # yen sign
    brvbar => chr 166, # broken vertical bar
    sect   => chr 167, # section sign
    uml    => chr 168, # umlaut (dieresis)
    copy   => chr 169, # copyright sign
    ordf   => chr 170, # ordinal indicator, feminine
    laquo  => chr 171, # angle quotation mark, left
    not    => chr 172, # not sign
    shy    => chr 173, # soft hyphen
    reg    => chr 174, # registered sign
    macr   => chr 175, # macron
    deg    => chr 176, # degree sign
    plusmn => chr 177, # plus-or-minus sign
    sup2   => chr 178, # superscript two
    sup3   => chr 179, # superscript three
    acute  => chr 180, # acute accent
    micro  => chr 181, # micro sign
    para   => chr 182, # pilcrow (paragraph sign)
    middot => chr 183, # middle dot
    cedil  => chr 184, # cedilla
    sup1   => chr 185, # superscript one
    ordm   => chr 186, # ordinal indicator, masculine
    raquo  => chr 187, # angle quotation mark, right
    frac14 => chr 188, # fraction one-quarter
    frac12 => chr 189, # fraction one-half
    frac34 => chr 190, # fraction three-quarters
    iquest => chr 191, # inverted question mark
    Agrave => chr 192, # capital A, grave accent
    Aacute => chr 193, # capital A, acute accent
    Acirc  => chr 194, # capital A, circumflex accent
    Atilde => chr 195, # capital A, tilde
    Auml   => chr 196, # capital A, dieresis or umlaut mark
    Aring  => chr 197, # capital A, ring
    AElig  => chr 198, # capital AE diphthong (ligature)
    Ccedil => chr 199, # capital C, cedilla
    Egrave => chr 200, # capital E, grave accent
    Eacute => chr 201, # capital E, acute accent
    Ecirc  => chr 202, # capital E, circumflex accent
    Euml   => chr 203, # capital E, dieresis or umlaut mark
    Igrave => chr 204, # capital I, grave accent
    Iacute => chr 205, # capital I, acute accent
    Icirc  => chr 206, # capital I, circumflex accent
    Iuml   => chr 207, # capital I, dieresis or umlaut mark
    ETH    => chr 208, # capital Eth, Icelandic
    Ntilde => chr 209, # capital N, tilde
    Ograve => chr 210, # capital O, grave accent
    Oacute => chr 211, # capital O, acute accent
    Ocirc  => chr 212, # capital O, circumflex accent
    Otilde => chr 213, # capital O, tilde
    Ouml   => chr 214, # capital O, dieresis or umlaut mark
    times  => chr 215, # multiply sign
    Oslash => chr 216, # capital O, slash
    Ugrave => chr 217, # capital U, grave accent
    Uacute => chr 218, # capital U, acute accent
    Ucirc  => chr 219, # capital U, circumflex accent
    Uuml   => chr 220, # capital U, dieresis or umlaut mark
    Yacute => chr 221, # capital Y, acute accent
    THORN  => chr 222, # capital THORN, Icelandic
    szlig  => chr 223, # small sharp s, German (sz ligature)
    agrave => chr 224, # small a, grave accent
    aacute => chr 225, # small a, acute accent
    acirc  => chr 226, # small a, circumflex accent
    atilde => chr 227, # small a, tilde
    auml   => chr 228, # small a, dieresis or umlaut mark
    aring  => chr 229, # small a, ring
    aelig  => chr 230, # small ae diphthong (ligature)
    ccedil => chr 231, # small c, cedilla
    egrave => chr 232, # small e, grave accent
    eacute => chr 233, # small e, acute accent
    ecirc  => chr 234, # small e, circumflex accent
    euml   => chr 235, # small e, dieresis or umlaut mark
    igrave => chr 236, # small i, grave accent
    iacute => chr 237, # small i, acute accent
    icirc  => chr 238, # small i, circumflex accent
    iuml   => chr 239, # small i, dieresis or umlaut mark
    eth    => chr 240, # small eth, Icelandic
    ntilde => chr 241, # small n, tilde
    ograve => chr 242, # small o, grave accent
    oacute => chr 243, # small o, acute accent
    ocirc  => chr 244, # small o, circumflex accent
    otilde => chr 245, # small o, tilde
    ouml   => chr 246, # small o, dieresis or umlaut mark
    divide => chr 247, # divide sign
    oslash => chr 248, # small o, slash
    ugrave => chr 249, # small u, grave accent
    uacute => chr 250, # small u, acute accent
    ucirc  => chr 251, # small u, circumflex accent
    uuml   => chr 252, # small u, dieresis or umlaut mark
    yacute => chr 253, # small y, acute accent
    thorn  => chr 254, # small thorn, Icelandic
    yuml   => chr 255, # small y, dieresis or umlaut mark
  );

  # &#161;
  for(0..255){
    $entity{'#' . $_} = chr $_;
  }
}

mw.pl
Description: Perl program

-- 
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]

Command line interface for http://www.m-w.com.

Reply via email to