Revision: 17568
Author: [email protected]
Date: Thu Nov 7 15:42:57 2013 UTC
Log: Experimental lexer generator: First draft of a Python lexer
(based on the automata).
BUG=
[email protected]
Review URL: https://codereview.chromium.org/60663007
http://code.google.com/p/v8/source/detail?r=17568
Added:
/branches/experimental/parser/tools/lexer_generator/lexer.py
/branches/experimental/parser/tools/lexer_generator/lexer_test.py
Modified:
/branches/experimental/parser/tools/lexer_generator/dfa.py
/branches/experimental/parser/tools/lexer_generator/test_suite.py
=======================================
--- /dev/null
+++ /branches/experimental/parser/tools/lexer_generator/lexer.py Thu Nov 7
15:42:57 2013 UTC
@@ -0,0 +1,91 @@
+# Copyright 2013 the V8 project authors. All rights reserved.
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following
+# disclaimer in the documentation and/or other materials provided
+# with the distribution.
+# * Neither the name of Google Inc. nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import argparse
+from nfa import Nfa, NfaBuilder
+from dfa import Dfa
+from rule_parser import RuleParser, RuleParserState
+
+# FIXME: We need to move this to a common place!
+def process_rules(parser_state):
+ dfas = {}
+ builder = NfaBuilder()
+ builder.set_character_classes(parser_state.character_classes)
+ for k, v in parser_state.rules.items():
+ graphs = []
+ for (graph, action) in v['regex']:
+ graphs.append(NfaBuilder.add_action(graph, action))
+ nfa = builder.nfa(NfaBuilder.or_graphs(graphs))
+ (start_name, dfa_nodes) = nfa.compute_dfa()
+ dfas[k] = Dfa(start_name, dfa_nodes)
+ return dfas
+
+# Lexes strings with the help of DFAs procuded by the grammar. For sanity
+# checking the automata.
+class Lexer(object):
+
+ def __init__(self, rules):
+ parser_state = RuleParserState()
+ RuleParser.parse(rules, parser_state)
+ self.dfas = process_rules(parser_state)
+
+ def lex(self, string):
+ dfa = self.dfas['default'] # FIXME
+
+ action_stream = []
+ terminate_seen = False
+ offset = 0
+ while not terminate_seen and string:
+ result = list(dfa.lex(string))
+ last_position = 0
+ for (action, position) in result:
+ action_stream.append((action[1], action[2], last_position +
offset, position + 1 + offset, string[last_position:(position + 1)]))
+ last_position = position
+ if action[2] == 'terminate':
+ terminate_seen = True
+ string = string[(last_position + 1):]
+ offset += last_position
+ return action_stream
+
+if __name__ == '__main__':
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--rules')
+ parser.add_argument('--input')
+ args = parser.parse_args()
+
+ re_file = args.rules
+ input_file = args.input
+
+ with open(re_file, 'r') as f:
+ rules = f.read()
+ with open(input_file, 'r') as f:
+ input_text = f.read() + '\0'
+
+ lexer = Lexer(rules)
+ for t in lexer.lex(input_text):
+ print t
=======================================
--- /dev/null
+++ /branches/experimental/parser/tools/lexer_generator/lexer_test.py Thu
Nov 7 15:42:57 2013 UTC
@@ -0,0 +1,67 @@
+# Copyright 2013 the V8 project authors. All rights reserved.
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following
+# disclaimer in the documentation and/or other materials provided
+# with the distribution.
+# * Neither the name of Google Inc. nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import unittest
+from lexer import Lexer
+
+class LexerTestCase(unittest.TestCase):
+
+ def __verify_action_stream(self, stream, expected_stream):
+ for (ix, item) in enumerate(expected_stream):
+ self.assertEquals(stream[ix][0], item[0])
+ self.assertEquals(stream[ix][4], item[1])
+
+ def test_simple(self):
+ rules = '''
+ <default>
+ "(" { LBRACE }
+ ")" { RBRACE }
+
+ "foo" { FOO }
+ eof <<terminate>>'''
+
+ lexer = Lexer(rules)
+
+ string = 'foo()\0'
+ self.__verify_action_stream(
+ lexer.lex(string),
+ [('FOO', 'foo'), ('LBRACE', '('), ('RBRACE', ')')])
+
+ def test_maximal_matching(self):
+ rules = '''
+ <default>
+ "<" { LT }
+ "<<" { SHL }
+ " " { SPACE }
+ eof <<terminate>>'''
+
+ lexer = Lexer(rules)
+
+ string = '<< <\0'
+ self.__verify_action_stream(
+ lexer.lex(string),
+ [('SHL', '<<'), ('SPACE', ' '), ('LT', '<')])
=======================================
--- /branches/experimental/parser/tools/lexer_generator/dfa.py Thu Nov 7
09:58:01 2013 UTC
+++ /branches/experimental/parser/tools/lexer_generator/dfa.py Thu Nov 7
15:42:57 2013 UTC
@@ -98,6 +98,39 @@
actions = list(self.collect_actions(string))
return actions and actions[-1][0] == 'TERMINATE'
+ def lex(self, string):
+ state = self.__start
+ stored_action = None
+ pos = 0
+ while pos < len(string):
+ c = string[pos]
+ next = [s for k, s in state.transitions().items() if
k.matches_char(c)]
+ if not next:
+ # Maybe we have a stored action before. Take it and backtrack to
the
+ # position where that action was.
+ if stored_action:
+ yield stored_action
+ return
+ # FIXME: Otherwise, use the default rule if this happens at the
start
+ # state of the automaton.
+
+ assert len(next) == 1
+ (state, action) = next[0]
+
+ # Special actions: terminate
+ if action and action[2] == 'terminate':
+ if stored_action:
+ yield stored_action
+ yield (action, pos)
+ return
+
+ # Normally we don't know yet whether to take the action - it depends
on
+ # what comes next.
+ if action:
+ stored_action = (action, pos)
+
+ pos += 1
+
def __visit_all_edges(self, visitor, state):
edge = set([self.__start])
first = lambda v: set([x[0] for x in v])
=======================================
--- /branches/experimental/parser/tools/lexer_generator/test_suite.py Thu
Nov 7 09:17:36 2013 UTC
+++ /branches/experimental/parser/tools/lexer_generator/test_suite.py Thu
Nov 7 15:42:57 2013 UTC
@@ -29,6 +29,7 @@
from action_test import *
from automata_test import *
+from lexer_test import *
from rule_parser_test import *
from transition_key_test import *
@@ -39,6 +40,7 @@
loader.loadTestsFromTestCase(AutomataTestCase),
loader.loadTestsFromTestCase(RuleParserTestCase),
loader.loadTestsFromTestCase(ActionTestCase),
+ loader.loadTestsFromTestCase(LexerTestCase),
))
runner = TextTestRunner(verbosity = 2)
runner.run(suite)
--
--
v8-dev mailing list
[email protected]
http://groups.google.com/group/v8-dev
---
You received this message because you are subscribed to the Google Groups "v8-dev" group.
To unsubscribe from this group and stop receiving emails from it, send an email
to [email protected].
For more options, visit https://groups.google.com/groups/opt_out.