Revision: 17568
Author:   [email protected]
Date:     Thu Nov  7 15:42:57 2013 UTC
Log: Experimental lexer generator: First draft of a Python lexer (based on the automata).

BUG=
[email protected]

Review URL: https://codereview.chromium.org/60663007
http://code.google.com/p/v8/source/detail?r=17568

Added:
 /branches/experimental/parser/tools/lexer_generator/lexer.py
 /branches/experimental/parser/tools/lexer_generator/lexer_test.py
Modified:
 /branches/experimental/parser/tools/lexer_generator/dfa.py
 /branches/experimental/parser/tools/lexer_generator/test_suite.py

=======================================
--- /dev/null
+++ /branches/experimental/parser/tools/lexer_generator/lexer.py Thu Nov 7 15:42:57 2013 UTC
@@ -0,0 +1,91 @@
+# Copyright 2013 the V8 project authors. All rights reserved.
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+#     * Redistributions of source code must retain the above copyright
+#       notice, this list of conditions and the following disclaimer.
+#     * Redistributions in binary form must reproduce the above
+#       copyright notice, this list of conditions and the following
+#       disclaimer in the documentation and/or other materials provided
+#       with the distribution.
+#     * Neither the name of Google Inc. nor the names of its
+#       contributors may be used to endorse or promote products derived
+#       from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import argparse
+from nfa import Nfa, NfaBuilder
+from dfa import Dfa
+from rule_parser import RuleParser, RuleParserState
+
+# FIXME: We need to move this to a common place!
+def process_rules(parser_state):
+  dfas = {}
+  builder = NfaBuilder()
+  builder.set_character_classes(parser_state.character_classes)
+  for k, v in parser_state.rules.items():
+    graphs = []
+    for (graph, action) in v['regex']:
+      graphs.append(NfaBuilder.add_action(graph, action))
+    nfa = builder.nfa(NfaBuilder.or_graphs(graphs))
+    (start_name, dfa_nodes) = nfa.compute_dfa()
+    dfas[k] = Dfa(start_name, dfa_nodes)
+  return dfas
+
+# Lexes strings with the help of DFAs procuded by the grammar. For sanity
+# checking the automata.
+class Lexer(object):
+
+  def __init__(self, rules):
+    parser_state = RuleParserState()
+    RuleParser.parse(rules, parser_state)
+    self.dfas = process_rules(parser_state)
+
+  def lex(self, string):
+    dfa = self.dfas['default'] # FIXME
+
+    action_stream = []
+    terminate_seen = False
+    offset = 0
+    while not terminate_seen and string:
+      result = list(dfa.lex(string))
+      last_position = 0
+      for (action, position) in result:
+ action_stream.append((action[1], action[2], last_position + offset, position + 1 + offset, string[last_position:(position + 1)]))
+        last_position = position
+        if action[2] == 'terminate':
+          terminate_seen = True
+      string = string[(last_position + 1):]
+      offset += last_position
+    return action_stream
+
+if __name__ == '__main__':
+
+  parser = argparse.ArgumentParser()
+  parser.add_argument('--rules')
+  parser.add_argument('--input')
+  args = parser.parse_args()
+
+  re_file = args.rules
+  input_file = args.input
+
+  with open(re_file, 'r') as f:
+    rules = f.read()
+  with open(input_file, 'r') as f:
+    input_text = f.read() + '\0'
+
+  lexer = Lexer(rules)
+  for t in lexer.lex(input_text):
+    print t
=======================================
--- /dev/null
+++ /branches/experimental/parser/tools/lexer_generator/lexer_test.py Thu Nov 7 15:42:57 2013 UTC
@@ -0,0 +1,67 @@
+# Copyright 2013 the V8 project authors. All rights reserved.
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+#     * Redistributions of source code must retain the above copyright
+#       notice, this list of conditions and the following disclaimer.
+#     * Redistributions in binary form must reproduce the above
+#       copyright notice, this list of conditions and the following
+#       disclaimer in the documentation and/or other materials provided
+#       with the distribution.
+#     * Neither the name of Google Inc. nor the names of its
+#       contributors may be used to endorse or promote products derived
+#       from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import unittest
+from lexer import Lexer
+
+class LexerTestCase(unittest.TestCase):
+
+  def __verify_action_stream(self, stream, expected_stream):
+    for (ix, item) in enumerate(expected_stream):
+      self.assertEquals(stream[ix][0], item[0])
+      self.assertEquals(stream[ix][4], item[1])
+
+  def test_simple(self):
+    rules = '''
+    <default>
+    "("           { LBRACE }
+    ")"           { RBRACE }
+
+    "foo"         { FOO }
+    eof           <<terminate>>'''
+
+    lexer = Lexer(rules)
+
+    string = 'foo()\0'
+    self.__verify_action_stream(
+        lexer.lex(string),
+        [('FOO', 'foo'), ('LBRACE', '('), ('RBRACE', ')')])
+
+  def test_maximal_matching(self):
+    rules = '''
+    <default>
+    "<"           { LT }
+    "<<"          { SHL }
+    " "           { SPACE }
+    eof           <<terminate>>'''
+
+    lexer = Lexer(rules)
+
+    string = '<< <\0'
+    self.__verify_action_stream(
+        lexer.lex(string),
+        [('SHL', '<<'), ('SPACE', ' '), ('LT', '<')])
=======================================
--- /branches/experimental/parser/tools/lexer_generator/dfa.py Thu Nov 7 09:58:01 2013 UTC +++ /branches/experimental/parser/tools/lexer_generator/dfa.py Thu Nov 7 15:42:57 2013 UTC
@@ -98,6 +98,39 @@
     actions = list(self.collect_actions(string))
     return actions and actions[-1][0] == 'TERMINATE'

+  def lex(self, string):
+    state = self.__start
+    stored_action = None
+    pos = 0
+    while pos < len(string):
+      c = string[pos]
+ next = [s for k, s in state.transitions().items() if k.matches_char(c)]
+      if not next:
+ # Maybe we have a stored action before. Take it and backtrack to the
+        # position where that action was.
+        if stored_action:
+          yield stored_action
+          return
+ # FIXME: Otherwise, use the default rule if this happens at the start
+        # state of the automaton.
+
+      assert len(next) == 1
+      (state, action) = next[0]
+
+      # Special actions: terminate
+      if action and action[2] == 'terminate':
+        if stored_action:
+          yield stored_action
+        yield (action, pos)
+        return
+
+ # Normally we don't know yet whether to take the action - it depends on
+      # what comes next.
+      if action:
+        stored_action = (action, pos)
+
+      pos += 1
+
   def __visit_all_edges(self, visitor, state):
     edge = set([self.__start])
     first = lambda v: set([x[0] for x in v])
=======================================
--- /branches/experimental/parser/tools/lexer_generator/test_suite.py Thu Nov 7 09:17:36 2013 UTC +++ /branches/experimental/parser/tools/lexer_generator/test_suite.py Thu Nov 7 15:42:57 2013 UTC
@@ -29,6 +29,7 @@

 from action_test import *
 from automata_test import *
+from lexer_test import *
 from rule_parser_test import *
 from transition_key_test import *

@@ -39,6 +40,7 @@
     loader.loadTestsFromTestCase(AutomataTestCase),
     loader.loadTestsFromTestCase(RuleParserTestCase),
     loader.loadTestsFromTestCase(ActionTestCase),
+    loader.loadTestsFromTestCase(LexerTestCase),
   ))
   runner = TextTestRunner(verbosity = 2)
   runner.run(suite)

--
--
v8-dev mailing list
[email protected]
http://groups.google.com/group/v8-dev
--- You received this message because you are subscribed to the Google Groups "v8-dev" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to [email protected].
For more options, visit https://groups.google.com/groups/opt_out.

Reply via email to