This is an automated email from the git hooks/post-receive script. js pushed a commit to tag PEVANS in repository libparser-mgc-perl.
commit 4b07f754778c02108e76635312d3c4ad089b107a Author: Paul Evans <leon...@leonerd.org.uk> Date: Wed Mar 23 21:18:26 2011 +0000 Import of PEVANS/Parser-MGC-0.08 from CPAN. gitpan-cpan-distribution: Parser-MGC gitpan-cpan-version: 0.08 gitpan-cpan-path: PEVANS/Parser-MGC-0.08.tar.gz gitpan-cpan-author: PEVANS gitpan-cpan-maturity: released --- Build.PL | 1 + Changes | 7 ++ MANIFEST | 3 + META.yml | 5 +- Makefile.PL | 1 + README | 44 +++++++++++-- examples/LICENSE | 25 +++++++ lib/Parser/MGC.pm | 185 ++++++++++++++++++++++++++++++++++++---------------- t/07generic_token.t | 32 +++++++++ t/12token_string.t | 16 +++-- t/13token_ident.t | 2 +- t/23sequence_of.t | 32 ++++++++- t/30commit.t | 32 ++++++++- t/32exception.t | 57 ++++++++++++++++ 14 files changed, 370 insertions(+), 72 deletions(-) diff --git a/Build.PL b/Build.PL index 45549a9..37b3b3e 100644 --- a/Build.PL +++ b/Build.PL @@ -9,6 +9,7 @@ my $build = Module::Build->new( 'File::Slurp' => 0, }, build_requires => { + 'File::Temp' => 0, 'Test::More' => 0, }, license => 'perl', diff --git a/Changes b/Changes index 4ef1d8e..028062a 100644 --- a/Changes +++ b/Changes @@ -1,5 +1,12 @@ Revision history for Parser-MGC +0.08 CHANGES: + * Give ->list_of and ->sequence_of proper failure-handling semantics + * Added ->generic_token + * Defer conversion of pos into line/col/text until string-formatting + a failure exception - improves performance of backtracking + * Make token_float tuneable + 0.07 CHANGES: * Allow ->expect to return subgroup captures in list context * Documentation improvements diff --git a/MANIFEST b/MANIFEST index 33ae696..680ae36 100644 --- a/MANIFEST +++ b/MANIFEST @@ -1,6 +1,7 @@ Build.PL Changes examples/eval-expr.pl +examples/LICENSE examples/parse-dict.pl examples/parse-pod.pl examples/synopsis.pl @@ -17,6 +18,7 @@ t/03reader.t t/04where.t t/05comment.t t/06substring.t +t/07generic_token.t t/10token_int.t t/11token_float.t t/12token_string.t @@ -29,6 +31,7 @@ t/23sequence_of.t t/24any_of.t t/30commit.t t/31scope_level.t +t/32exception.t t/90ex_dict.t t/90ex_expr.t t/90ex_pod.t diff --git a/META.yml b/META.yml index b3e7f51..ab5e98d 100644 --- a/META.yml +++ b/META.yml @@ -3,6 +3,7 @@ abstract: 'build simple recursive-descent parsers' author: - 'Paul Evans <leon...@leonerd.org.uk>' build_requires: + File::Temp: 0 Test::More: 0 configure_requires: Module::Build: 0.36 @@ -15,9 +16,9 @@ name: Parser-MGC provides: Parser::MGC: file: lib/Parser/MGC.pm - version: 0.07 + version: 0.08 requires: File::Slurp: 0 resources: license: http://dev.perl.org/licenses/ -version: 0.07 +version: 0.08 diff --git a/Makefile.PL b/Makefile.PL index e9136a7..50ff6cc 100644 --- a/Makefile.PL +++ b/Makefile.PL @@ -6,6 +6,7 @@ WriteMakefile 'VERSION_FROM' => 'lib/Parser/MGC.pm', 'PREREQ_PM' => { 'File::Slurp' => 0, + 'File::Temp' => 0, 'Test::More' => 0 }, 'INSTALLDIRS' => 'site', diff --git a/README b/README index 20d6c24..b293800 100644 --- a/README +++ b/README @@ -72,8 +72,13 @@ PATTERNS * int Pattern used to parse an integer by "token_int". Defaults to - "/0x[[:xdigit:]]+|[[:digit:]]+/". If "accept_0o_oct" is given, then - this will be expanded to match "/0o[0-7]+/" as well. + "/-?(?:0x[[:xdigit:]]+|[[:digit:]]+)/". If "accept_0o_oct" is given, + then this will be expanded to match "/0o[0-7]+/" as well. + + * float + + Pattern used to parse a floating-point number by "token_float". + Defaults to "/-?(?:\d*\.\d+|\d+\.)(?:e-?\d+)?|-?\d+e-?\d+/i". * ident @@ -243,9 +248,10 @@ STRUCTURE-FORMING METHODS $parser->commit Calling this method will cancel the backtracking behaviour of the - innermost "maybe" or "any_of" structure forming method. That is, if - later code then calls "fail", the exception will be propagated out of - "maybe", and no further code blocks will be attempted by "any_of". + innermost "maybe", "list_of", "sequence_of", or "any_of" structure + forming method. That is, if later code then calls "fail", the exception + will be propagated out of "maybe", and no further code blocks will be + attempted by "any_of". Typically this will be called once the grammatical structure of an alternation has been determined, ensuring that any further failures are @@ -311,6 +317,29 @@ TOKEN PARSING METHODS parser to read quoted strings, or similar cases where whitespace should be preserved. + $val = $parser->generic_token( $name, $re, $convert ) + Expects to find a token matching the precompiled regexp $re. If + provided, the $convert CODE reference can be used to convert the string + into a more convenient form. $name is used in the failure message if the + pattern fails to match. + + If provided, the $convert function will be passed the parser and the + matching substring; the value it returns is returned from + "generic_token". + + $convert->( $parser, $substr ) + + If not provided, the substring will be returned as it stands. + + This method is mostly provided for subclasses to define their own token + types. For example: + + sub token_hex + { + my $self = shift; + $self->generic_token( hex => qr/[0-9A-F]{2}h/, sub { hex $_[1] } ); + } + $int = $parser->token_int Expects to find an integer in decimal, octal or hexadecimal notation, and consumes it. Negative integers, preceeded by "-", are also @@ -387,7 +416,10 @@ TODO consider instead a "parse_string_generic" using a loop over "substring_before". - * Easy ability for subclasses to define more token types + * Easy ability for subclasses to define more token types as methods. + Perhaps provide a class method such as + + __PACKAGE__->has_token( hex => qr/[0-9A-F]+/i, sub { hex $_[1] } ); * Investigate how well "from_reader" can cope with buffer splitting across other tokens than simply skippable whitespace diff --git a/examples/LICENSE b/examples/LICENSE new file mode 100644 index 0000000..7ba5db2 --- /dev/null +++ b/examples/LICENSE @@ -0,0 +1,25 @@ +The following licence applies to the example scripts in this directory +---------------------------------------------------------------------- + + +The MIT License + +Copyright (c) 2011 Paul Evans <leon...@leonerd.org.uk> + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/lib/Parser/MGC.pm b/lib/Parser/MGC.pm index 2a90c14..c9748ad 100644 --- a/lib/Parser/MGC.pm +++ b/lib/Parser/MGC.pm @@ -8,7 +8,7 @@ package Parser::MGC; use strict; use warnings; -our $VERSION = '0.07'; +our $VERSION = '0.08'; use Carp; @@ -106,8 +106,13 @@ Pattern used to skip comments between tokens. Undefined by default. =item * int Pattern used to parse an integer by C<token_int>. Defaults to -C</0x[[:xdigit:]]+|[[:digit:]]+/>. If C<accept_0o_oct> is given, then this -will be expanded to match C</0o[0-7]+/> as well. +C</-?(?:0x[[:xdigit:]]+|[[:digit:]]+)/>. If C<accept_0o_oct> is given, then +this will be expanded to match C</0o[0-7]+/> as well. + +=item * float + +Pattern used to parse a floating-point number by C<token_float>. Defaults to +C</-?(?:\d*\.\d+|\d+\.)(?:e-?\d+)?|-?\d+e-?\d+/i>. =item * ident @@ -126,17 +131,17 @@ my @patterns = qw( ws comment int + float ident string_delim ); -use constant { - pattern_ws => qr/[\s\n\t]+/, - pattern_comment => undef, - pattern_int => qr/0x[[:xdigit:]]+|[[:digit:]]+/, - pattern_ident => qr/[[:alpha:]_]\w*/, - pattern_string_delim => qr/["']/, -}; +use constant pattern_ws => qr/[\s\n\t]+/; +use constant pattern_comment => undef; +use constant pattern_int => qr/-?(?:0x[[:xdigit:]]+|[[:digit:]]+)/; +use constant pattern_float => qr/-?(?:\d*\.\d+|\d+\.)(?:e-?\d+)?|-?\d+e-?\d+/i; +use constant pattern_ident => qr/[[:alpha:]_]\w*/; +use constant pattern_string_delim => qr/["']/; sub new { @@ -253,8 +258,10 @@ column is numbered 0. sub where { my $self = shift; + my ( $pos ) = @_; + + defined $pos or $pos = pos $self->{str}; - my $pos = pos $self->{str}; my $str = $self->{str}; my $sol = $pos; @@ -285,7 +292,7 @@ sub fail my $self = shift; my ( $message ) = @_; - die Parser::MGC::Failure->new( $message, $self->where ); + die Parser::MGC::Failure->new( $message, $self, pos($self->{str}) ); } =head2 $eos = $parser->at_eos @@ -457,15 +464,29 @@ sub list_of my $self = shift; my ( $sep, $code ) = @_; - ref $sep or $sep = qr/\Q$sep/; + ref $sep or $sep = qr/\Q$sep/ if defined $sep; + + my $committed; + local $self->{committer} = sub { $committed++ }; my @ret; while( !$self->at_eos ) { - push @ret, $code->( $self ); + $committed = 0; + my $pos = pos $self->{str}; - $self->skip_ws; - $self->{str} =~ m/\G$sep/gc or last; + eval { push @ret, $code->( $self ); 1 } and next; + my $e = $@; + + pos($self->{str}) = $pos; + die $e if $committed or not eval { $e->isa( "Parser::MGC::Failure" ) }; + last; + } + continue { + if( defined $sep ) { + $self->skip_ws; + $self->{str} =~ m/\G$sep/gc or last; + } } return \@ret; @@ -493,13 +514,7 @@ sub sequence_of my $self = shift; my ( $code ) = @_; - my @ret; - - while( !$self->at_eos ) { - push @ret, $code->( $self ); - } - - return \@ret; + $self->list_of( undef, $code ); } =head2 $ret = $parser->any_of( @codes ) @@ -556,9 +571,9 @@ sub any_of =head2 $parser->commit Calling this method will cancel the backtracking behaviour of the innermost -C<maybe> or C<any_of> structure forming method. That is, if later code then -calls C<fail>, the exception will be propagated out of C<maybe>, and no -further code blocks will be attempted by C<any_of>. +C<maybe>, C<list_of>, C<sequence_of>, or C<any_of> structure forming method. +That is, if later code then calls C<fail>, the exception will be propagated +out of C<maybe>, and no further code blocks will be attempted by C<any_of>. Typically this will be called once the grammatical structure of an alternation has been determined, ensuring that any further failures are raised @@ -717,6 +732,59 @@ sub substring_before return substr( $self->{str}, $start, $end - $start ); } +=head2 $val = $parser->generic_token( $name, $re, $convert ) + +Expects to find a token matching the precompiled regexp C<$re>. If provided, +the C<$convert> CODE reference can be used to convert the string into a more +convenient form. C<$name> is used in the failure message if the pattern fails +to match. + +If provided, the C<$convert> function will be passed the parser and the +matching substring; the value it returns is returned from C<generic_token>. + + $convert->( $parser, $substr ) + +If not provided, the substring will be returned as it stands. + +This method is mostly provided for subclasses to define their own token types. +For example: + + sub token_hex + { + my $self = shift; + $self->generic_token( hex => qr/[0-9A-F]{2}h/, sub { hex $_[1] } ); + } + +=cut + +sub generic_token +{ + my $self = shift; + my ( $name, $re, $convert ) = @_; + + $self->fail( "Expected $name" ) if $self->at_eos; + + $self->skip_ws; + $self->{str} =~ m/\G$re/gc or + $self->fail( "Expected $name" ); + + my $match = substr( $self->{str}, $-[0], $+[0] - $-[0] ); + + return $convert ? $convert->( $self, $match ) : $match; +} + +sub _token_generic +{ + my $self = shift; + my %args = @_; + + my $name = $args{name}; + my $re = $args{pattern} ? $self->{patterns}{ $args{pattern} } : $args{re}; + my $convert = $args{convert}; + + $self->generic_token( $name, $re, $convert ); +} + =head2 $int = $parser->token_int Expects to find an integer in decimal, octal or hexadecimal notation, and @@ -727,20 +795,20 @@ consumes it. Negative integers, preceeded by C<->, are also recognised. sub token_int { my $self = shift; + $self->_token_generic( + name => "int", - $self->fail( "Expected integer" ) if $self->at_eos; - - $self->skip_ws; - $self->{str} =~ m/\G(-?)($self->{patterns}{int})/gc or - $self->fail( "Expected integer" ); + pattern => "int", + convert => sub { + my $int = $_[1]; + my $sign = ( $int =~ s/^-// ) ? -1 : 1; - my $sign = $1 ? -1 : 1; - my $int = $2; + $int =~ s/^0o/0/; - $int =~ s/^0o/0/; - - return $sign * oct $int if $int =~ m/^0/; - return $sign * $int; + return $sign * oct $int if $int =~ m/^0/; + return $sign * $int; + }, + ); } =head2 $float = $parser->token_float @@ -755,14 +823,12 @@ numerical value is then returned. sub token_float { my $self = shift; + $self->_token_generic( + name => "float", - $self->fail( "Expected float" ) if $self->at_eos; - - $self->skip_ws; - $self->{str} =~ m/\G(-?(?:\d*\.\d+|\d+\.)(?:e-?\d+)?|-?\d+e-?\d+)/gci or - $self->fail( "Expected float" ); - - return $1 + 0; + pattern => "float", + convert => sub { $_[1] + 0 }, + ); } =head2 $str = $parser->token_string @@ -845,14 +911,11 @@ Expects to find an identifier, and consumes it. sub token_ident { my $self = shift; + $self->_token_generic( + name => "ident", - $self->fail( "Expected identifier" ) if $self->at_eos; - - $self->skip_ws; - $self->{str} =~ m/\G($self->{patterns}{ident})/gc or - $self->fail( "Expected identifier" ); - - return $1; + pattern => "ident", + ); } =head2 $keyword = $parser->token_kw( @keywords ) @@ -887,7 +950,7 @@ sub new { my $class = shift; my $self = bless {}, $class; - @{$self}{qw( message linenum col text )} = @_; + @{$self}{qw( message parser pos )} = @_; return $self; } @@ -896,15 +959,22 @@ sub STRING { my $self = shift; + my $parser = $self->{parser}; + my ( $linenum, $col, $text ) = $parser->where( $self->{pos} ); + # Column number only counts characters. There may be tabs in there. # Rather than trying to calculate the visual column number, just print the # indentation as it stands. - my $indent = substr( $self->{text}, 0, $self->{col} ); + my $indent = substr( $text, 0, $col ); $indent =~ s/[^ \t]/ /g; # blank out all the non-whitespace - return "$self->{message} on line $self->{linenum} at:\n" . - "$self->{text}\n" . + my $filename = $parser->{filename}; + my $in_file = ( defined $filename and !ref $filename ) + ? "in $filename " : ""; + + return "$self->{message} ${in_file}on line $linenum at:\n" . + "$text\n" . "$indent^\n"; } @@ -954,7 +1024,10 @@ instead a C<parse_string_generic> using a loop over C<substring_before>. =item * -Easy ability for subclasses to define more token types +Easy ability for subclasses to define more token types as methods. Perhaps +provide a class method such as + + __PACKAGE__->has_token( hex => qr/[0-9A-F]+/i, sub { hex $_[1] } ); =item * diff --git a/t/07generic_token.t b/t/07generic_token.t new file mode 100644 index 0000000..a82903e --- /dev/null +++ b/t/07generic_token.t @@ -0,0 +1,32 @@ +#!/usr/bin/perl -w + +use strict; + +use Test::More tests => 4; + +package TestParser; +use base qw( Parser::MGC ); + +my $re; +my $convert; + +sub parse +{ + my $self = shift; + + return $self->generic_token( token => $re, $convert ); +} + +package main; + +my $parser = TestParser->new; + +$re = qr/[A-Z]+/; +is( $parser->from_string( "HELLO" ), "HELLO", 'Simple RE' ); +ok( !eval { $parser->from_string( "hello" ) }, 'Simple RE fails' ); + +$re = qr/[A-Z]+/i; +is( $parser->from_string( "Hello" ), "Hello", 'RE with flags' ); + +$convert = sub { lc $_[1] }; +is( $parser->from_string( "Hello" ), "hello", 'Conversion function' ); diff --git a/t/12token_string.t b/t/12token_string.t index 39c3eba..64f1a30 100644 --- a/t/12token_string.t +++ b/t/12token_string.t @@ -14,6 +14,16 @@ sub parse return $self->token_string; } +package StringPairParser; +use base qw( Parser::MGC ); + +sub parse +{ + my $self = shift; + + return [ $self->token_string, $self->token_string ]; +} + package main; my $parser = TestParser->new; @@ -46,11 +56,7 @@ $parser = TestParser->new( is( $parser->from_string( q["double"] ), "double", 'Double quoted string still passes' ); ok( !eval { $parser->from_string( q['single'] ) }, 'Single quoted string now fails' ); -no warnings 'redefine'; -local *TestParser::parse = sub { - my $self = shift; - return [ $self->token_string, $self->token_string ]; -}; +$parser = StringPairParser->new; is_deeply( $parser->from_string( q["foo" "bar"] ), [ "foo", "bar" ], diff --git a/t/13token_ident.t b/t/13token_ident.t index 4785131..50d42f1 100644 --- a/t/13token_ident.t +++ b/t/13token_ident.t @@ -24,7 +24,7 @@ is( $parser->from_string( "x" ), "x", 'Single-letter identifier' ); ok( !eval { $parser->from_string( "123" ) }, '"123" fails' ); is( $@, - qq[Expected identifier on line 1 at:\n] . + qq[Expected ident on line 1 at:\n] . qq[123\n] . qq[^\n], 'Exception from "123" failure' ); diff --git a/t/23sequence_of.t b/t/23sequence_of.t index e91a6e4..deaf7a3 100644 --- a/t/23sequence_of.t +++ b/t/23sequence_of.t @@ -2,7 +2,7 @@ use strict; -use Test::More tests => 2; +use Test::More tests => 6; package TestParser; use base qw( Parser::MGC ); @@ -16,9 +16,39 @@ sub parse } ); } +package IntThenStringParser; +use base qw( Parser::MGC ); + +sub parse +{ + my $self = shift; + + [ $self->sequence_of( sub { + return $self->token_int; + } ), + + $self->sequence_of( sub { + return $self->token_string; + } ), + ]; +} + package main; my $parser = TestParser->new; is_deeply( $parser->from_string( "123" ), [ 123 ], '"123"' ); is_deeply( $parser->from_string( "4 5 6" ), [ 4, 5, 6 ], '"4 5 6"' ); + +is_deeply( $parser->from_string( "" ), [], '""' ); + +$parser = IntThenStringParser->new; + +is_deeply( $parser->from_string( "10 20 'ab' 'cd'" ), + [ [ 10, 20 ], [ 'ab', 'cd' ] ], q("10 20 'ab' 'cd'") ); + +is_deeply( $parser->from_string( "10 20" ), + [ [ 10, 20 ], [] ], q("10 20") ); + +is_deeply( $parser->from_string( "'ab' 'cd'" ), + [ [], [ 'ab', 'cd' ] ], q("'ab' 'cd'") ); diff --git a/t/30commit.t b/t/30commit.t index 77fe010..c7207c0 100644 --- a/t/30commit.t +++ b/t/30commit.t @@ -2,7 +2,7 @@ use strict; -use Test::More tests => 4; +use Test::More tests => 7; package TestParser; use base qw( Parser::MGC ); @@ -24,6 +24,23 @@ sub parse ); } +package IntStringPairsParser; +use base qw( Parser::MGC ); + +sub parse +{ + my $self = shift; + + $self->sequence_of( sub { + my $int = $self->token_int; + $self->commit; + + my $str = $self->token_string; + + [ $int, $str ]; + } ); +} + package main; my $parser = TestParser->new; @@ -37,3 +54,16 @@ is( $@, qq[(456)\n]. qq[ ^\n], 'Exception from "(456)" failure' ); + +$parser = IntStringPairsParser->new; + +is_deeply( $parser->from_string( "1 'one' 2 'two'" ), + [ [ 1, "one" ], [ 2, "two" ] ], + "1 'one' 2 'two'" ); + +ok( !eval { $parser->from_string( "1 'one' 2" ) }, "1 'one' 2 fails" ); +is( $@, + qq[Expected string on line 1 at:\n]. + qq[1 'one' 2\n]. + qq[ ^\n], + 'Exception from 1 \'one\' 2 failure' ); diff --git a/t/32exception.t b/t/32exception.t new file mode 100644 index 0000000..86455b0 --- /dev/null +++ b/t/32exception.t @@ -0,0 +1,57 @@ +#!/usr/bin/perl -w + +use strict; + +use Test::More tests => 9; +use File::Temp qw( tempfile ); + +package TestParser; +use base qw( Parser::MGC ); + +sub parse +{ + my $self = shift; + + return $self->token_int; +} + +package main; + +my $parser = TestParser->new; + +isa_ok( $parser, "TestParser", '$parser' ); +isa_ok( $parser, "Parser::MGC", '$parser' ); + +my $value = $parser->from_string( "\t123" ); + +is( $value, 123, '->from_string' ); + +ok( !eval { $parser->from_string( "\t123." ) }, 'Trailing input on string fails' ); +is( $@, + qq[Expected end of input on line 1 at:\n]. + qq[\t123.\n]. + qq[\t ^\n], + 'Exception from trailing input on string' ); + +ok( !eval { $parser->from_file( \*DATA ) }, 'Trailing input on glob filehandle fails' ); +is( $@, + qq[Expected end of input on line 1 at:\n]. + qq[ 123.\n]. + qq[ ^\n], + 'Exception from trailing input on glob filehandle' ); + +my ( $fh, $filename ) = tempfile( "tmpfile.XXXXXX", UNLINK => 1 ); +END { defined $filename and unlink $filename } + +print $fh " 123.\n"; +close $fh; + +ok( !eval { $parser->from_file( $filename ) }, 'Trailing input on named file fails' ); +is( $@, + qq[Expected end of input in $filename on line 1 at:\n]. + qq[ 123.\n]. + qq[ ^\n], + 'Exception from trailing input on named file' ); + +__DATA__ + 123. -- Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-perl/packages/libparser-mgc-perl.git _______________________________________________ Pkg-perl-cvs-commits mailing list Pkg-perl-cvs-commits@lists.alioth.debian.org http://lists.alioth.debian.org/cgi-bin/mailman/listinfo/pkg-perl-cvs-commits