Revision: 17169
Author:   [email protected]
Date:     Fri Oct 11 15:38:49 2013 UTC
Log:      Minimal push mode scanner around the experimental rules.

- Had to extend the rules a bit (it wouldn't compile).
- The rules are still mostly toy rules (see the number recognition for example).

BUG=
[email protected]

Review URL: https://codereview.chromium.org/26531003
http://code.google.com/p/v8/source/detail?r=17169

Modified:
 /branches/experimental/parser/src/lexer/lexer.re

=======================================
--- /branches/experimental/parser/src/lexer/lexer.re Fri Oct 11 08:30:53 2013 UTC
+++ /branches/experimental/parser/src/lexer/lexer.re Fri Oct 11 15:38:49 2013 UTC
@@ -1,128 +1,460 @@
-/*!re2c
+#include <fcntl.h>
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>

+/*!types:re2c */

-  re2c:define:YYCTYPE  = "uint8_t";
-  re2c:define:YYCURSOR = p;
-  re2c:yyfill:enable   = 0;
-  re2c:yych:conversion = 0;
-  re2c:indent:top      = 1;
+#if defined(WIN32)

+    typedef signed char     int8_t;
+    typedef signed short    int16_t;
+    typedef signed int      int32_t;

-  eof = [\000];
-  any = [\000-\377];
-  whitespace_char = [ \h\t\v\f\r];
-  whitespace = whitespace_char+;
-  identifier_start = [$_\\a-zA-z];
-  identifier_char = [$_\\a-zA-z0-9];
-  number_start = [0-9];
-  line_terminator = [\n\r]+;
+    typedef unsigned char   uint8_t;
+    typedef unsigned short  uint16_t;
+    typedef unsigned int    uint32_t;

+#else

-  <NORMAL> "("                     { PUSH_T(LPAREN); }
-  <NORMAL> ")"                     { PUSH_T(RPAREN); }
-  <NORMAL> "["                     { PUSH_T(LBRACK); }
-  <NORMAL> "]"                     { PUSH_T(RBRACK); }
-  <NORMAL> "{"                     { PUSH_T(LBRACE); }
-  <NORMAL> "}"                     { PUSH_T(RBRACE); }
-  <NORMAL> ":"                     { PUSH_T(COLON); }
-  <NORMAL> ";"                     { PUSH_T(SEMICOLON); }
-  <NORMAL> "."                     { PUSH_T(PERIOD); }
-  <NORMAL> "?"                     { PUSH_T(CONDITIONAL); }
-  <NORMAL> "++"                    { PUSH_T(INC); }
-  <NORMAL> "--"                    { PUSH_T(DEC); }
+    #include <stdint.h>
+    #include <unistd.h>

+    #ifndef O_BINARY
+        #define O_BINARY 0
+    #endif

-  <NORMAL> "="                     { PUSH_T(ASSIGN); }
-  <NORMAL> "|="                    { PUSH_T(ASSIGN_BIT_OR); }
-  <NORMAL> "^="                    { PUSH_T(ASSIGN_BIT_XOR); }
-  <NORMAL> "&="                    { PUSH_T(ASSIGN_BIT_AND); }
-  <NORMAL> "<<="                   { PUSH_T(ASSIGN_SHL); }
-  <NORMAL> ">>="                   { PUSH_T(ASSIGN_SAR); }
-  <NORMAL> ">>>="                  { PUSH_T(ASSIGN_SHR); }
-  <NORMAL> "+="                    { PUSH_T(ASSIGN_ADD); }
-  <NORMAL> "-="                    { PUSH_T(ASSIGN_SUB); }
-  <NORMAL> "*="                    { PUSH_T(ASSIGN_MUL); }
-  <NORMAL> "/="                    { PUSH_T(ASSIGN_DIV); }
-  <NORMAL> "%="                    { PUSH_T(ASSIGN_MOD); }
+#endif

+// ----------------------------------------------------------------------
+// Debug stand-ins for a real token sink: each macro prints what the scanner
+// just recognized. All of them except PUSH_EOS and TERMINATE_ILLEGAL end with
+// SKIP(), which is #defined inside PushScanner::push() before the scanner body.
+//
+// PUSH_EOS ignores its argument and only reports the end of the stream.
+#define PUSH_EOS(T) { printf("got eos\n"); }
+// Prints the numeric value of token T.
+#define PUSH_T(T) { printf("got token %d\n", T); SKIP(); }
+#define PUSH_STRING() { printf("got string\n"); SKIP(); }
+#define PUSH_NUMBER() { printf("got number\n"); SKIP(); }
+// Prints the bytes between `start` and `cursor` as the identifier text.
+#define PUSH_IDENTIFIER() { \
+        printf("got identifier: "); \
+        size_t tokenSize = cursor-start; \
+        fwrite(start, tokenSize, 1, stdout); \
+        printf("\n"); \
+        SKIP(); }
+#define PUSH_LINE_TERMINATOR() { printf("got line terminator\n"); SKIP();}
+// Makes the enclosing function return 1.
+#define TERMINATE_ILLEGAL() { return 1; }

-  <NORMAL> ","                     { PUSH_T(COMMA); }
-  <NORMAL> "||"                    { PUSH_T(OR); }
-  <NORMAL> "&&"                    { PUSH_T(AND); }
-  <NORMAL> "|"                     { PUSH_T(BIT_OR); }
-  <NORMAL> "^"                     { PUSH_T(BIT_XOR); }
-  <NORMAL> "&"                     { PUSH_T(BIT_AND); }
-  <NORMAL> "<<"                    { PUSH_T(SHL); }
-  <NORMAL> ">>"                    { PUSH_T(SAR); }
-  <NORMAL> "+"                     { PUSH_T(ADD); }
-  <NORMAL> "-"                     { PUSH_T(SUB); }
-  <NORMAL> "*"                     { PUSH_T(MUL); }
-  <NORMAL> "/"                     { PUSH_T(DIV); }
-  <NORMAL> "%"                     { PUSH_T(MOD); }
+// X-macro list of every token kind. Define TOK(name) before expanding TOKENS
+// and #undef it afterwards. The list is expanded once to build tokenNames and
+// once to build the PushScanner::Token enum, so both follow the same order.
+#define TOKENS \
+        TOK(EOS) \
+        TOK(LPAREN) \
+        TOK(RPAREN) \
+        TOK(LBRACK) \
+        TOK(RBRACK) \
+        TOK(LBRACE) \
+        TOK(RBRACE) \
+        TOK(COLON) \
+        TOK(SEMICOLON) \
+        TOK(PERIOD) \
+        TOK(CONDITIONAL) \
+        TOK(INC) \
+        TOK(DEC) \
+        TOK(ASSIGN) \
+        TOK(ASSIGN_BIT_OR) \
+        TOK(ASSIGN_BIT_XOR) \
+        TOK(ASSIGN_BIT_AND) \
+        TOK(ASSIGN_SHL) \
+        TOK(ASSIGN_SAR) \
+        TOK(ASSIGN_SHR) \
+        TOK(ASSIGN_ADD) \
+        TOK(ASSIGN_SUB) \
+        TOK(ASSIGN_MUL) \
+        TOK(ASSIGN_DIV) \
+        TOK(ASSIGN_MOD) \
+        TOK(COMMA) \
+        TOK(OR) \
+        TOK(AND) \
+        TOK(BIT_OR) \
+        TOK(BIT_XOR) \
+        TOK(BIT_AND) \
+        TOK(SHL) \
+        TOK(SAR) \
+        TOK(ADD) \
+        TOK(SUB) \
+        TOK(MUL) \
+        TOK(DIV) \
+        TOK(MOD) \
+        TOK(EQ) \
+        TOK(NE) \
+        TOK(EQ_STRICT) \
+        TOK(NE_STRICT) \
+        TOK(LT) \
+        TOK(GT) \
+        TOK(LTE) \
+        TOK(GTE) \
+        TOK(NOT) \
+        TOK(BIT_NOT) \

+// ----------------------------------------------------------------------
+// Printable name of each token, in TOKENS order, so that it can be indexed
+// with a PushScanner::Token value (PushScanner::send() does exactly that).
+static const char *tokenNames[] = {
+#define TOK(name) #name,
+    TOKENS
+#undef TOK
+};

-  <NORMAL> "=="                    { PUSH_T(EQ); }
-  <NORMAL> "!="                    { PUSH_T(NE); }
-  <NORMAL> "==="                   { PUSH_T(EQ_STRICT); }
-  <NORMAL> "!=="                   { PUSH_T(NE_STRICT); }
-  <NORMAL> "<"                     { PUSH_T(LT); }
-  <NORMAL> ">"                     { PUSH_T(GT); }
-  <NORMAL> "<="                    { PUSH_T(LTE); }
-  <NORMAL> ">="                    { PUSH_T(GTE); }
+// ----------------------------------------------------------------------
+class PushScanner
+{
+public:

+    enum Token
+    {
+        #define TOK(x) x,
+            TOKENS
+        #undef TOK
+    };

-  <NORMAL> "!"                     { PUSH_T(NOT); }
-  <NORMAL> "~"                     { PUSH_T(BIT_NOT); }
+private:

-  <NORMAL> line_terminator+        { PUSH_LINE_TERMINATOR(); }
+    bool        eof;
+    int32_t     state;
+    int32_t     condition;

-  <NORMAL> whitespace              {}
+    uint8_t     *limit;
+    uint8_t     *start;
+    uint8_t     *cursor;
+    uint8_t     *marker;

+    uint8_t     *buffer;
+    uint8_t     *bufferEnd;

-  <NORMAL> "//"                    :=> SINGLE_LINE_COMMENT
-  <NORMAL> "/*"                    :=> MULTILINE_COMMENT
-  <NORMAL> "<!--"                  :=> HTML_COMMENT
+    uint8_t     yych;
+    uint32_t    yyaccept;

+public:

-  <NORMAL> ["]                     :=> STRING
-  <NORMAL> [']                     :=> SINGLE_QUOTE_STRING
+ // ----------------------------------------------------------------------
+    // Builds an idle scanner: every buffer pointer is null, the saved re2c
+    // state is -1, the condition is Normal and no end of input has been seen.
+    // yych and yyaccept are deliberately left untouched, as before.
+    PushScanner()
+        : eof(false),
+          state(-1),
+          condition(EConditionNormal),
+          limit(0),
+          start(0),
+          cursor(0),
+          marker(0),
+          buffer(0),
+          bufferEnd(0)
+    {
+    }

+ // ----------------------------------------------------------------------
+    // Releases the buffer that push() grows with realloc(). free(NULL) is a
+    // no-op, so a scanner that never received any data is handled as well.
+    // NOTE(review): the class has no user-defined copy operations, so a copied
+    // scanner would free the same buffer twice -- do not copy PushScanner.
+    ~PushScanner()
+    {
+        free(buffer);
+    }

-  <NORMAL> identifier_start        :=> IDENTIFIER
+ // ----------------------------------------------------------------------
+    // Debug sink: prints the token's numeric type and name and, for every
+    // token except EOS, the bytes currently between `start` and `cursor`.
+    void send(
+        Token token
+    )
+    {
+        const char *name = tokenNames[token];
+        const size_t lexemeSize = cursor-start;
+        printf("scanner is pushing out a token of type %d (%s)", token, name);

-  <NORMAL> number_start            :=> NUMBER
+        // EOS carries no text: finish the line and stop.
+        if(token==EOS)
+        {
+            putchar('\n');
+            return;
+        }
+
+        // Pad names shorter than 20 characters so the lexemes line up.
+        const size_t nameSize = strlen(name);
+        for(size_t col=(nameSize<20 ? nameSize : 20); col<20; ++col) putchar(' ');
+        printf(" : ---->");
+        fwrite(start, lexemeSize, 1, stdout);

-  <NORMAL> eof                     { PUSH_T(EOS); }
-  <NORMAL> any                     { TERMINATE_ILLEGAL(); }
+        printf("<----\n");
+    }

+ // ----------------------------------------------------------------------
+    // Appends one batch of input to the internal buffer and resumes the
+    // re2c-generated scanner over it.
+    //
+    // input / inputSize: the new bytes. A batch shorter than maxFill (32),
+    // including a negative size, is treated as the end-of-input signal.
+    //
+    // Returns 0 when the scanner asked for more input (the `fill` path, after
+    // the consumed prefix has been discarded), and 1 when the Normal-condition
+    // eof rule fired or a rule invoked TERMINATE_ILLEGAL().
+    //
+    // NOTE(review): inputSize is printed with %ld; that only matches when
+    // ssize_t is long on the target -- confirm or cast.
+    uint32_t push(
+        const void  *input,
+        ssize_t     inputSize
+    )
+    {
+        printf(
+            "scanner is receiving a new data batch of length %ld\n"
+            "scanner continues with saved state = %d\n",
+            inputSize,
+            state
+        );

+        /*
+         * Data source is signaling end of file when batch size
+         * is less than maxFill. This is slightly annoying because
+         * maxFill is a value that can only be known after re2c does
+         * its thing. Practically though, maxFill is never bigger than
+         * the longest keyword, so given our grammar, 32 is a safe bet.
+         */
+        // NOTE(review): in the short-batch case `input` is replaced by 64 zero
+        // bytes, so the bytes the caller passed in that batch are never copied
+        // into the buffer below.
+        uint8_t null[64];
+        const ssize_t maxFill = 32;
+        if(inputSize<maxFill) // FIXME: do something about this!!!
+        {
+            eof = true;
+            input = null;
+            inputSize = sizeof(null);
+            memset(null, 0, sizeof(null));
+        }
+
+        /*
+         * When we get here, we have a partially
+         * consumed buffer which is in the following state:
+         *                                                                last valid char        last valid buffer spot
+         *                                                                v                      v
+         * +-------------------+-------------+---------------+-------------+----------------------+
+         * ^                   ^             ^               ^             ^                      ^
+         * buffer              start         marker          cursor        limit                  bufferEnd
+         *
+         * We need to stretch the buffer and concatenate the new chunk of input to it
+         *
+         */
+        size_t used = limit-buffer;
+        size_t needed = used+inputSize;
+        size_t allocated = bufferEnd-buffer;
+        if(allocated<needed)
+        {
+            // realloc may move the block, so remember every pointer as an
+            // offset and rebuild it from the new base afterwards.
+            size_t limitOffset = limit-buffer;
+            size_t startOffset = start-buffer;
+            size_t markerOffset = marker-buffer;
+            size_t cursorOffset = cursor-buffer;
+
+                // NOTE(review): the realloc result is not checked for NULL.
+                buffer = (uint8_t*)realloc(buffer, needed);
+                bufferEnd = needed+buffer;
+
+            marker = markerOffset + buffer;
+            cursor = cursorOffset + buffer;
+            start = buffer + startOffset;
+            limit = limitOffset + buffer;
+        }
+        memcpy(limit, input, inputSize);
+        limit += inputSize;
+
+        // The scanner starts here
+        #define YYLIMIT         limit
+        #define YYCURSOR        cursor
+        #define YYMARKER        marker
+        #define YYCTYPE         uint8_t
+
 #define SKIP() { start = cursor; YYSETCONDITION(EConditionNormal); goto yy0; }
+        #define SEND(x)         { send(x); SKIP();          }
+        #define YYFILL(n)       { goto fill;                }
+
+        #define YYGETSTATE()    state
+        #define YYSETSTATE(x)   { state = (x);  }
+
+        #define YYGETCONDITION() condition
+        #define YYSETCONDITION(x) { condition = (x);  }
+
+    start:
+
 printf("Starting a round; state: %d, condition: %d\n", state, condition);
+
+        /*!re2c
+        re2c:indent:top      = 1;
+        re2c:yych:conversion = 0;
+        re2c:condenumprefix          = ECondition;
+        re2c:define:YYCONDTYPE       = Condition;
+
+        eof = "\000";
+        any = [\000-\377];
+        whitespace_char = [ \t\v\f\r];
+        whitespace = whitespace_char+;
+        identifier_start = [$_\\a-zA-z];
+        identifier_char = [$_\\a-zA-z0-9];
+        number_start = [0-9];
+        number_char = [0-9\.e];
+        line_terminator = [\n\r]+;
+
+        <Normal> "("                     { PUSH_T(LPAREN); }
+        <Normal> ")"                     { PUSH_T(RPAREN); }
+        <Normal> "["                     { PUSH_T(LBRACK); }
+        <Normal> "]"                     { PUSH_T(RBRACK); }
+        <Normal> "{"                     { PUSH_T(LBRACE); }
+        <Normal> "}"                     { PUSH_T(RBRACE); }
+        <Normal> ":"                     { PUSH_T(COLON); }
+        <Normal> ";"                     { PUSH_T(SEMICOLON); }
+        <Normal> "."                     { PUSH_T(PERIOD); }
+        <Normal> "?"                     { PUSH_T(CONDITIONAL); }
+        <Normal> "++"                    { PUSH_T(INC); }
+        <Normal> "--"                    { PUSH_T(DEC); }
+
+        <Normal> "|="                    { PUSH_T(ASSIGN_BIT_OR); }
+        <Normal> "^="                    { PUSH_T(ASSIGN_BIT_XOR); }
+        <Normal> "&="                    { PUSH_T(ASSIGN_BIT_AND); }
+        <Normal> "<<="                   { PUSH_T(ASSIGN_SHL); }
+        <Normal> ">>="                   { PUSH_T(ASSIGN_SAR); }
+        <Normal> ">>>="                  { PUSH_T(ASSIGN_SHR); }
+        <Normal> "+="                    { PUSH_T(ASSIGN_ADD); }
+        <Normal> "-="                    { PUSH_T(ASSIGN_SUB); }
+        <Normal> "*="                    { PUSH_T(ASSIGN_MUL); }
+        <Normal> "/="                    { PUSH_T(ASSIGN_DIV); }
+        <Normal> "%="                    { PUSH_T(ASSIGN_MOD); }
+
+        <Normal> ","                     { PUSH_T(COMMA); }
+        <Normal> "||"                    { PUSH_T(OR); }
+        <Normal> "&&"                    { PUSH_T(AND); }
+        <Normal> "|"                     { PUSH_T(BIT_OR); }
+        <Normal> "^"                     { PUSH_T(BIT_XOR); }
+        <Normal> "&"                     { PUSH_T(BIT_AND); }
+        <Normal> "<<"                    { PUSH_T(SHL); }
+        <Normal> ">>"                    { PUSH_T(SAR); }
+        <Normal> "+"                     { PUSH_T(ADD); }
+        <Normal> "-"                     { PUSH_T(SUB); }
+        <Normal> "*"                     { PUSH_T(MUL); }
+        <Normal> "/"                     { PUSH_T(DIV); }
+        <Normal> "%"                     { PUSH_T(MOD); }
+
+        <Normal> "==="                   { PUSH_T(EQ_STRICT); }
+        <Normal> "=="                    { PUSH_T(EQ); }
+        <Normal> "!=="                   { PUSH_T(NE_STRICT); }
+        <Normal> "!="                    { PUSH_T(NE); }
+        <Normal> "<="                    { PUSH_T(LTE); }
+        <Normal> ">="                    { PUSH_T(GTE); }
+        <Normal> "<"                     { PUSH_T(LT); }
+        <Normal> ">"                     { PUSH_T(GT); }
+
+        <Normal> "="                     { PUSH_T(ASSIGN); }
+
+        <Normal> "!"                     { PUSH_T(NOT); }
+        <Normal> "~"                     { PUSH_T(BIT_NOT); }
+
+        <Normal> line_terminator+        { PUSH_LINE_TERMINATOR(); }
+        <Normal> whitespace              { SKIP();}
+
+        <Normal> "//"                    :=> SingleLineComment
+        <Normal> "/*"                    :=> MultiLineComment
+        <Normal> "<!--"                  :=> HtmlComment
+
+        <Normal> ["]                     :=> DoubleQuoteString
+        <Normal> [']                     :=> SingleQuoteString
+
+        <Normal> identifier_start        :=> Identifier
+        <Normal> number_start            :=> Number
+
+        <Normal> eof                     { PUSH_EOS(); return 1; }
+        <Normal> any                     { TERMINATE_ILLEGAL(); }
+
+        <DoubleQuoteString> "\\\""       {}
+        <DoubleQuoteString> ["]          { PUSH_STRING();}
+        <DoubleQuoteString> any          {}

-  <STRING> "\\\""                {}
-  <STRING> ["]                   { PUSH_STRING(); TRANSITION(NORMAL); }
-  <STRING> any                   {}
+        <SingleQuoteString> "\\'"    {}
+        <SingleQuoteString> "'"      { PUSH_STRING();}
+        <SingleQuoteString> any      {}

+        <Identifier> identifier_char+  {}
+        <Identifier> any               { PUSH_IDENTIFIER(); }

-  <SINGLE_QUOTE_STRING> "\\'"    {}
-  <SINGLE_QUOTE_STRING> "'"      { PUSH_STRING(); TRANSITION(NORMAL); }
-  <SINGLE_QUOTE_STRING> any      {}
+        <SingleLineComment> line_terminator
+                                       { PUSH_LINE_TERMINATOR();}

+        <SingleLineComment> any+     {}

+        <MultiLineComment> [*][//]      { PUSH_LINE_TERMINATOR();}
+        <MultiLineComment> eof { TERMINATE_ILLEGAL(); }
+        <MultiLineComment> any+       {}

-  <IDENTIFIER> identifier_char+  {}
-  <IDENTIFIER> any               { PUSH_IDENTIFIER(); TRANSITION(NORMAL); }
+        <HtmlComment> any+            {}
+        <HtmlComment> eof { TERMINATE_ILLEGAL(); }
+        <HtmlComment> "-->"           { }

+        <Number> number_char+          { }
+        <Number> any                   { PUSH_NUMBER(); }

+        */

-  <SINGLE_LINE_COMMENT> line_terminator
- { PUSH_LINE_TERMINATOR(); TRANSITION(NORMAL); }
+    // Reached through YYFILL(): the scanner ran out of buffered input.
+    fill:
+        ssize_t unfinishedSize = cursor-start;
+        printf(
+            "scanner needs a refill. Exiting for now with:\n"
+            "    saved fill state = %d\n"
+            "    unfinished token size = %ld\n",
+            state,
+            unfinishedSize
+        );

-  <SINGLE_LINE_COMMENT> any+     {}
+        if(0<unfinishedSize && start<limit)
+        {
+            printf("    unfinished token is: ");
+            fwrite(start, 1, cursor-start, stdout);
+            putchar('\n');
+        }
+        putchar('\n');

+        /*
+         * Once we get here, we can get rid of
+         * everything before start and after limit.
+         */
+        // Once end of input has been signalled there is nothing more to wait
+        // for, so the scanner is re-entered instead of returning to the caller.
+        if(eof==true) goto start;
+        if(buffer<start)
+        {
+            // Slide the unconsumed bytes to the front of the buffer and shift
+            // every pointer back by the same amount.
+            size_t startOffset = start-buffer;
+            memmove(buffer, start, limit-start);
+            marker -= startOffset;
+            cursor -= startOffset;
+            limit -= startOffset;
+            start -= startOffset;
+        }
+        return 0;
+    }
+};

+// ----------------------------------------------------------------------
+// Entry point: tokenizes the file named by argv[1] (or file descriptor 0 when
+// no argument is given) by pushing fixed-size batches into a PushScanner.
+// Returns 0 on success, 1 when the input cannot be read or when
+// PushScanner::push() reports a non-zero status; exits with status 1 when the
+// file cannot be opened.
+int main(
+    int     argc,
+    char    **argv
+)
+{
+    // Parse cmd line
+    int input = 0;
+    if(1<argc)
+    {
+        input = open(argv[1], O_RDONLY | O_BINARY);
+        if(input<0)
+        {
+            fprintf(
+                stderr,
+                "could not open file %s\n",
+                argv[1]
+            );
+            exit(1);
+        }
+    }

 <MULTILINE_COMMENT> [*][//] { PUSH_LINE_TERMINATOR(); TRANSITION(NORMAL); }
-  <MULTILINE_COMMENT> eof { TERMINATE_ILLEGAL(); }
-  <MULTILINE_COMMENT> any+       {}
+    /*
+     * Tokenize input file by pushing batches
+     * of data one by one into the scanner.
+     */
+    const size_t batchSize = 256;
+    uint8_t buffer[batchSize];
+    PushScanner scanner;
+    while(1)
+    {
+        ssize_t n = read(input, buffer, batchSize);
+        if(n<0)
+        {
+            // A failed read is an I/O error, not a short final batch: report
+            // it instead of handing -1 to the scanner as a batch length.
+            fprintf(stderr, "could not read input\n");
+            close(input);
+            return 1;
+        }
+        if(scanner.push(buffer, n))
+        {
+            printf("Scanner: illegal data\n");
+            close(input);
+            return 1;
+        }
+        // n is known to be non-negative here, so the cast keeps the
+        // comparison with the unsigned batchSize well defined.
+        if((size_t)n<batchSize) break;
+    }
+    // A negative size is below push()'s maxFill threshold, which is how the
+    // end of input is signalled to the scanner.
+    scanner.push(0, -1);
+    close(input);

-*/
+    // Done
+    return 0;
+}

--
--
v8-dev mailing list
[email protected]
http://groups.google.com/group/v8-dev
--- You received this message because you are subscribed to the Google Groups "v8-dev" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to [email protected].
For more options, visit https://groups.google.com/groups/opt_out.

Reply via email to