[perl #124333] Segfault that I can't seem to nail down

via RT Sat, 18 Apr 2015 08:31:30 -0700

# New Ticket Created by   
# Please include the string:  [perl #124333]
# in the subject line of all future correspondence about this issue. 
# <URL: https://rt.perl.org/Ticket/Display.html?id=124333 >



OS: Ubuntu 14.04 LTS under VirtualBox
Host: Windows 8, Intel Core i5

Rakudo: This is perl6 version 2015.03-317-g37ec24f built on MoarVM 
version 2015.03-133-ga300558

The attached file is a radically-trimmed ECMAScript grammar which 
segfaults the p6 compiler routinely. The original is at 
https://github.com/drforr/perl6-ecmascript if you would like to see it. 
By 'radically trimmed' I mean there's little there that resembles the 
original grammar. I suspect the root cause of the problem is the 
UnicodeLetter token. The original grammar is from the ANTLR repository, 
and ANTLR doesn't have a way to represent Unicode properties. Otherwise 
they would have used the moral equivalent of \p{Letter} instead of the 
huge set of alternations.

As it stands, run the enclosed file and the p6 compiler segfaults. Alter 
anything, including removing a test (even those that fail, and there are 
a lot of them) and the segfault goes away. It's taken me the better part 
of a day to trim the example down to this point, and it's now past the 
point of diminishing returns for me to continue to work on it. At this 
point it needs someone who can debug it in a different way than I have 
been doing. My instinct says it's Unicode-related, but I'd also look at 
the unaryExpression term. There are a few points of left-recursion in 
the grammar and those are always suspect.

use v6;

# Heavily reduced ECMAScript grammar kept as a crash reproduction for
# RT #124333. The rule bodies are intentionally minimal and are not a
# faithful ECMAScript grammar; do not "fix" or simplify them, because the
# report states that altering anything makes the segfault disappear.
#
# Conventions used below:
#   * `rule`  - whitespace written inside the pattern is significant
#               (it is matched through <.ws>), and the match ratchets.
#   * `token` - ratchets as well, but pattern whitespace is ignored.
#   * `X+ % ','` / `X* % ','` - one-or-more / zero-or-more X separated
#               by commas.
grammar Grammar::ECMAScript {

  # Default entry point: the whole input is a list of source elements.
  rule TOP { <sourceElements>  }

  # One or more expression statements (the only statement kind reachable
  # from TOP in this reduced grammar).
  rule sourceElements { <expressionStatement>+ }

  # Named function: the identifier is mandatory here.
  rule functionDeclaration {
    'function' <Identifier> <formalParameterList> <functionBody>
  }

  # Function expression: same shape, but the identifier is optional.
  rule functionExpression {
    'function' <Identifier>?  <formalParameterList> <functionBody>
  }

  # Parenthesised run of zero or more identifiers. Note that no comma
  # separator is specified in this reduced form.
  rule formalParameterList { '(' <Identifier>* ')' }

  # Brace-delimited body; recurses back into <sourceElements>.
  rule functionBody { '{' <sourceElements> '}' }

  # Exactly one expression statement.
  rule statementList { <expressionStatement> }

  # Comma-separated list of one or more variable declarations.
  rule variableDeclarationList { <variableDeclaration>+ % ','  }

  # Identifier with an optional `= ...` initialiser.
  rule variableDeclaration { <Identifier> <initialiser>?  }

  # "NoIn" counterpart of <variableDeclaration>.
  rule variableDeclarationNoIn { <Identifier> <initialiserNoIn>?  }

  # `=` followed by an assignment expression.
  rule initialiser { '=' <assignmentExpression> }

  # `=` followed by the "NoIn" assignment expression.
  rule initialiserNoIn { '=' <assignmentExpressionNoIn> }

  # An expression terminated by a literal semicolon (the semicolon is
  # required; there is no automatic semicolon insertion here).
  rule expressionStatement { <expression> ';' }

  # `while ( expr ) statement`, where the statement must be an expression
  # statement.
  rule whileStatement { 'while' '(' <expression> ')' <expressionStatement> }

  # Initialiser part of a classic `for` loop: a "NoIn" expression.
  rule forStatementInitialiserPart { <expressionNoIn> }

  # Header of a `for ( lhs in expr )` loop; no loop body is matched by
  # this rule.
  rule forInStatement {
    'for' '(' <leftHandSideExpression> 'in' <expression> ')'
  }

  # Initialiser part of a `for ... in` loop: a left-hand-side expression.
  rule forInStatementInitialiserPart { <leftHandSideExpression> }

  # `return`, an optional expression, and a mandatory semicolon.
  rule returnStatement { 'return' <expression>? ';' }

  # `with ( expr ) statement`, statement being an expression statement.
  rule withStatement { 'with' '(' <expression> ')'  <expressionStatement> }

  # `label : statement`.
  rule labelledStatement { <Identifier> ':' <expressionStatement> }

  # `throw` followed by an expression; no terminator is matched.
  rule throwStatement { 'throw' <expression> }

  # Comma-separated list of one or more assignment expressions.
  rule expression { <assignmentExpression>+ % ',' }

  # "NoIn" counterpart of <expression>.
  rule expressionNoIn { <assignmentExpressionNoIn>+ % ',' }

  # Either a conditional expression, or a left-hand side followed by `=`.
  # In this reduced form nothing is matched after the `=`.
  rule assignmentExpression {
    <conditionalExpression> | <leftHandSideExpression> '='
  }

  # "NoIn" counterpart; only the conditional branch exists.
  rule assignmentExpressionNoIn { <conditionalExpressionNoIn> }

  # A primary expression, optionally followed by a call argument list.
  rule leftHandSideExpression {
    <primaryExpression> <arguments> | <primaryExpression>
  }

  # Parenthesised, comma-separated list of zero or more arguments.
  rule arguments { '(' <assignmentExpression>* % ',' ')' }

  # `[ expr ]` subscript suffix.
  rule indexSuffix { '[' <expression> ']' }

  # `.name` property suffix.
  rule propertyReferenceSuffix { '.' <Identifier> }

  # The next four rules form a pass-through precedence chain:
  # conditional -> relational -> shift -> multiplicative -> unary.
  # No binary operators are matched at any level in this reduced grammar.
  rule conditionalExpression { <relationalExpression> }

  rule relationalExpression { <shiftExpression> }

  # One or more multiplicative expressions in sequence.
  rule shiftExpression { <multiplicativeExpression>+ }

  rule multiplicativeExpression { <unaryExpression> }

  # "NoIn" chain: conditional -> logicalOR -> relational, which then joins
  # the regular chain at <shiftExpression>.
  rule conditionalExpressionNoIn { <logicalORExpressionNoIn> }

  rule logicalORExpressionNoIn { <relationalExpressionNoIn> }

  rule relationalExpressionNoIn { <shiftExpression> }

  # Either a postfix expression, or one prefix operator/keyword followed by
  # another unary expression (the rule recurses on itself after consuming
  # the prefix). The reporter singles this rule out as a possible suspect.
  rule unaryExpression {
    <postfixExpression>
  | [ 'delete' | 'void' | 'typeof' | '++' | '--' | '+' | '-' | '~' | '!' ]
    <unaryExpression>
  }

  # Left-hand-side expression with an optional trailing `++` or `--`.
  rule postfixExpression { <leftHandSideExpression> [ '++' | '--' ]?  }

  # Identifier, literal, or parenthesised expression.
  rule primaryExpression {
  | <Identifier>
  | <literal>
  | '(' <expression> ')'
  }

  # An optional leading dot followed by exactly one ASCII digit.
  token literal { '.'? <[0..9]> }

  # One or more characters accepted by <UnicodeLetter>.
  token Identifier { <UnicodeLetter>+ }

  # Matches a single character from a hand-enumerated set: first a list of
  # inclusive code point ranges, then a list of individual code points.
  # Per the report, the list derives from an ANTLR grammar that could not
  # express Unicode properties, and this token is the reporter's prime
  # suspect for the crash.
  token UnicodeLetter {
    <[\x[0041]..\x[005A]]>
  | <[\x[0061]..\x[007A]]>
  | <[\x[0388]..\x[038A]]>
  | <[\x[038E]..\x[03A1]]>
  | <[\x[03A3]..\x[03CE]]>
  | <[\x[03D0]..\x[03D7]]>
  | <[\x[03DA]..\x[03F3]]>
  | <[\x[0400]..\x[0481]]>
  | <[\x[048C]..\x[04C4]]>
  | <[\x[04C7]..\x[04C8]]>
  | <[\x[04CB]..\x[04CC]]>
  | <[\x[04D0]..\x[04F5]]>
  | <[\x[04F8]..\x[04F9]]>
  | <[\x[0531]..\x[0556]]>
  | <[\x[06E5]..\x[06E6]]>
  | <[\x[06FA]..\x[06FC]]>
  | <[\x[1312]..\x[1315]]>
  | <[\x[1318]..\x[131E]]>
  | <[\x[1320]..\x[1346]]>
  | <[\x[1348]..\x[135A]]>
  | <[\x[13A0]..\x[13B0]]>
  | <[\x[13B1]..\x[13F4]]>
  | <[\x[1401]..\x[1676]]>
  | <[\x[1681]..\x[169A]]>
  | <[\x[16A0]..\x[16EA]]>
  | <[\x[1780]..\x[17B3]]>
  | <[\x[1820]..\x[1877]]>
  | <[\x[1880]..\x[18A8]]>
  | <[\x[1E00]..\x[1E9B]]>
  | <[\x[1EA0]..\x[1EE0]]>
  | <[\x[1EE1]..\x[1EF9]]>
  | <[\x[1F00]..\x[1F15]]>
  | <[\x[1F18]..\x[1F1D]]>
  | <[\x[1F20]..\x[1F39]]>
  | <[\x[1F3A]..\x[1F45]]>
  | <[\x[1F48]..\x[1F4D]]>
  | <[\x[1F50]..\x[1F57]]>
  | <[\x[210A]..\x[2113]]>
  | <[\x[2119]..\x[211D]]>
  | <[\x[212A]..\x[212D]]>
  | <[\x[212F]..\x[2131]]>
  | <[\x[2133]..\x[2139]]>
  | <[\x[2160]..\x[2183]]>
  | <[\x[3005]..\x[3007]]>
  | <[\x[3021]..\x[3029]]>
  | <[\x[3031]..\x[3035]]>
  | <[\x[3038]..\x[303A]]>
  | <[\x[3041]..\x[3094]]>
  | <[\x[309D]..\x[309E]]>
  | <[\x[30A1]..\x[30FA]]>
  | <[\x[30FC]..\x[30FE]]>
  | <[\x[3105]..\x[312C]]>
  | <[\x[3131]..\x[318E]]>
  | <[\x[31A0]..\x[31B7]]>
  | <[\x[A000]..\x[A48C]]>
  | <[\x[F900]..\x[FA2D]]>
  | <[\x[FB00]..\x[FB06]]>
  | <[\x[FB13]..\x[FB17]]>
  | <[\x[FB1F]..\x[FB28]]>
  | <[\x[FB2A]..\x[FB36]]>
  | <[\x[FB38]..\x[FB3C]]>
  | <[\x[FB40]..\x[FB41]]>
  | <[\x[FB43]..\x[FB44]]>
  | <[\x[FB46]..\x[FBB1]]>
  | <[\x[FBD3]..\x[FD3D]]>
  | <[\x[FD50]..\x[FD8F]]>
  | <[\x[FD92]..\x[FDC7]]>
  | <[\x[FDF0]..\x[FDFB]]>
  | <[\x[FE70]..\x[FE72]]>
  | <[\x[FE76]..\x[FEFC]]>
  | <[\x[FF21]..\x[FF3A]]>
  | <[\x[FF41]..\x[FF5A]]>
  | <[\x[FF66]..\x[FFBE]]>
  | <[\x[FFC2]..\x[FFC7]]>
  | <[\x[FFCA]..\x[FFCF]]>
  | <[\x[FFD2]..\x[FFD7]]>
  | <[\x[FFDA]..\x[FFDC]]>
  | \x[038C]
  | \x[0559]
  | \x[06D5]
  | \x[0710]
  | \x[1310]
  | \x[2115]
  | \x[2124]
  | \x[2126]
  | \x[2128]
  | \x[3400]
  | \x[4DB5]
  | \x[4E00]
  | \x[9FA5]
  | \x[AC00]
  | \x[D7A3]
  | \x[FB1D]
  | \x[FB3E]
  | \x[FE74]
  }
}

use Test;

# Grammar instance shared by every check below.
my $g = Grammar::ECMAScript.new;

##############################################################################

# Each check parses one input string starting from the rule named by the
# `rule` argument rather than from TOP. `.parse` must consume the whole
# string to succeed, so `ok` passes on a full match and `nok` passes when
# the parse fails.
#
# Quoting matters: q{...} does not interpolate, so q{\u0000a} is the literal
# characters backslash, 'u', '0', ... ; qq{...} does interpolate, so `\n`
# and `\x[...]` become the actual newline / code point.
#
# Several inputs use syntax this trimmed grammar has no rule for (`var`,
# `$` or `_` in identifiers, binary operators such as `*`, `<`, `<<`, `||`),
# so those `ok` checks cannot pass -- presumably these are the failing
# tests the report mentions. They are kept on purpose: per the report,
# removing any test (even a failing one) makes the segfault go away.
# No plan is declared and the order is roughly alphabetical by rule name.

ok  $g.parse( q{for(var a in b)a++;}, rule => 'forInStatement' );
ok  $g.parse( q{var a}, rule => 'forInStatementInitialiserPart' );
ok  $g.parse( q{(a)}, rule => 'formalParameterList' );
ok  $g.parse( q{( a)}, rule => 'formalParameterList' );
ok  $g.parse( q{( $a, \u0000a )}, rule => 'formalParameterList' );
ok  $g.parse( q{var a}, rule => 'forStatementInitialiserPart' );
ok  $g.parse( q{function a(){a++;}}, rule => 'functionDeclaration' );
ok  $g.parse( q{function(){a++;}}, rule => 'functionExpression' );
ok  $g.parse( q{a}, rule => 'Identifier' );
ok  $g.parse( q{ab}, rule => 'Identifier' );
ok  $g.parse( q{$a}, rule => 'Identifier' );
ok  $g.parse( q{a_}, rule => 'Identifier' );
ok  $g.parse( q{\u0000a}, rule => 'Identifier' );
ok  $g.parse( q{[a]}, rule => 'indexSuffix' );
ok  $g.parse( qq{[\na]}, rule => 'indexSuffix' );
ok  $g.parse( qq{[a\n]}, rule => 'indexSuffix' );
ok  $g.parse( qq{[\na\n]}, rule => 'indexSuffix' );
ok  $g.parse( q{=a in b}, rule => 'initialiser' );
nok $g.parse( q{=a in b}, rule => 'initialiserNoIn' );
ok  $g.parse( q{a:b;}, rule => 'labelledStatement' );
nok $g.parse( q{a||b in c}, rule => 'logicalORExpressionNoIn' );
ok  $g.parse( q{a||b}, rule => 'logicalORExpressionNoIn' );
ok  $g.parse( q{a*b}, rule => 'multiplicativeExpression' );
ok  $g.parse( q{a++}, rule => 'postfixExpression' );
ok  $g.parse( q{.a_}, rule => 'propertyReferenceSuffix' );
ok  $g.parse( q{. a_}, rule => 'propertyReferenceSuffix' );
ok  $g.parse( qq{.\na_}, rule => 'propertyReferenceSuffix' );
ok  $g.parse( q{a<b}, rule => 'relationalExpression' );
ok  $g.parse( q{a in b}, rule => 'relationalExpression' );
nok $g.parse( q{a in b}, rule => 'relationalExpressionNoIn' );
ok  $g.parse( q{a<b}, rule => 'relationalExpressionNoIn' );
ok  $g.parse( q{return a_;}, rule => 'returnStatement' );
ok  $g.parse( qq{return a_\n}, rule => 'returnStatement' );
ok  $g.parse( q{a<<b}, rule => 'shiftExpression' );
ok  $g.parse( q{a++;}, rule => 'sourceElements' );
ok  $g.parse( qq{a++;\nb<3;}, rule => 'sourceElements' );
ok  $g.parse( q{a++;}, rule => 'statementList' );
ok  $g.parse( q{throw a_;}, rule => 'throwStatement' );
ok  $g.parse( qq{throw a_\n}, rule => 'throwStatement' );
ok  $g.parse( q{a++;}, rule => 'TOP' );
ok  $g.parse( q{void a}, rule => 'unaryExpression' );
# UnicodeLetter boundary checks: empty input, a digit, and U+00AB (which is
# not in any listed range) must be rejected; 'a' and U+04CB must match.
nok $g.parse( q{}, rule => 'UnicodeLetter' );
nok $g.parse( q{9}, rule => 'UnicodeLetter' );
nok $g.parse( qq{\x[00ab]}, rule => 'UnicodeLetter' );
ok  $g.parse( q{a}, rule => 'UnicodeLetter' );
ok  $g.parse( qq{\x[04cb]}, rule => 'UnicodeLetter' );
ok  $g.parse( q{a=1}, rule => 'variableDeclarationList' );
ok  $g.parse( q{a=1,b=a}, rule => 'variableDeclarationList' );
ok  $g.parse( q{A}, rule => 'variableDeclarationNoIn' );
ok  $g.parse( q{A=1}, rule => 'variableDeclarationNoIn' );
ok  $g.parse( q{a=c in b}, rule => 'variableDeclarationNoIn' );
ok  $g.parse( q{while(1)a++;}, rule => 'whileStatement' );
ok  $g.parse( q{with(1)a++;}, rule => 'withStatement' );

[perl #124333] Segfault that I can't seem to nail down

Reply via email to