Revision: 17591
Author: [email protected]
Date: Fri Nov 8 12:36:06 2013 UTC
Log: Experimental lexer generator: Refactoring, merge Lexer and
Generator.
[email protected]
[email protected]
BUG=
Review URL: https://codereview.chromium.org/66613002
http://code.google.com/p/v8/source/detail?r=17591
Deleted:
/branches/experimental/parser/tools/lexer_generator/lexer.py
Modified:
/branches/experimental/parser/tools/lexer_generator/generator.py
/branches/experimental/parser/tools/lexer_generator/lexer_test.py
=======================================
--- /branches/experimental/parser/tools/lexer_generator/lexer.py Thu Nov 7
15:42:57 2013 UTC
+++ /dev/null
@@ -1,91 +0,0 @@
-# Copyright 2013 the V8 project authors. All rights reserved.
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are
-# met:
-#
-# * Redistributions of source code must retain the above copyright
-# notice, this list of conditions and the following disclaimer.
-# * Redistributions in binary form must reproduce the above
-# copyright notice, this list of conditions and the following
-# disclaimer in the documentation and/or other materials provided
-# with the distribution.
-# * Neither the name of Google Inc. nor the names of its
-# contributors may be used to endorse or promote products derived
-# from this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-import argparse
-from nfa import Nfa, NfaBuilder
-from dfa import Dfa
-from rule_parser import RuleParser, RuleParserState
-
-# FIXME: We need to move this to a common place!
-def process_rules(parser_state):
- dfas = {}
- builder = NfaBuilder()
- builder.set_character_classes(parser_state.character_classes)
- for k, v in parser_state.rules.items():
- graphs = []
- for (graph, action) in v['regex']:
- graphs.append(NfaBuilder.add_action(graph, action))
- nfa = builder.nfa(NfaBuilder.or_graphs(graphs))
- (start_name, dfa_nodes) = nfa.compute_dfa()
- dfas[k] = Dfa(start_name, dfa_nodes)
- return dfas
-
-# Lexes strings with the help of DFAs produced by the grammar. For sanity
-# checking the automata.
-class Lexer(object):
-
- def __init__(self, rules):
- parser_state = RuleParserState()
- RuleParser.parse(rules, parser_state)
- self.dfas = process_rules(parser_state)
-
- def lex(self, string):
- dfa = self.dfas['default'] # FIXME
-
- action_stream = []
- terminate_seen = False
- offset = 0
- while not terminate_seen and string:
- result = list(dfa.lex(string))
- last_position = 0
- for (action, position) in result:
- action_stream.append((action[1], action[2], last_position +
offset, position + 1 + offset, string[last_position:(position + 1)]))
- last_position = position
- if action[2] == 'terminate':
- terminate_seen = True
- string = string[(last_position + 1):]
- offset += last_position
- return action_stream
-
-if __name__ == '__main__':
-
- parser = argparse.ArgumentParser()
- parser.add_argument('--rules')
- parser.add_argument('--input')
- args = parser.parse_args()
-
- re_file = args.rules
- input_file = args.input
-
- with open(re_file, 'r') as f:
- rules = f.read()
- with open(input_file, 'r') as f:
- input_text = f.read() + '\0'
-
- lexer = Lexer(rules)
- for t in lexer.lex(input_text):
- print t
=======================================
--- /branches/experimental/parser/tools/lexer_generator/generator.py Fri
Nov 8 11:52:04 2013 UTC
+++ /branches/experimental/parser/tools/lexer_generator/generator.py Fri
Nov 8 12:36:06 2013 UTC
@@ -62,83 +62,117 @@
%s
</script>'''
-def generate_html(data):
- scripts = []
- loads = []
- for i, (name, nfa, dfa) in enumerate(data):
- if name == 'Normal': continue
- (nfa_i, dfa_i) = ("nfa_%d" % i, "dfa_%d" % i)
- scripts.append(script_template % (nfa_i, nfa.to_dot()))
- scripts.append(script_template % (dfa_i, dfa.to_dot()))
- loads.append(load_template % ("nfa [%s]" % name, nfa_i))
- loads.append(load_template % ("dfa [%s]" % name, dfa_i))
- body = "\n".join(scripts) + (load_outer_template % "\n".join(loads))
- return file_template % body
+class Generator(object):
-def process_rules(parser_state):
- rule_map = {}
- builder = NfaBuilder()
- builder.set_character_classes(parser_state.character_classes)
- assert 'default' in parser_state.rules
- def process(k, v):
- assert 'default' in v
- graphs = []
- for (graph, action) in v['regex']:
- (precedence, code, transition) = action
- if code:
- graph = NfaBuilder.add_action(graph, (precedence, code, None))
- if transition == 'continue':
- if not v['default'][1][2] == 'continue':
- graph = NfaBuilder.add_continue(graph)
+ def __init__(self, rules):
+ parser_state = RuleParserState()
+ RuleParser.parse(rules, parser_state)
+ self.__automata = {}
+ self.process_rules(parser_state)
+
+ def generate_html(self):
+ scripts = []
+ loads = []
+ for i, name in enumerate(self.__automata):
+ (nfa, dfa) = self.__automata[name]
+ if name == 'Normal': continue
+ (nfa_i, dfa_i) = ("nfa_%d" % i, "dfa_%d" % i)
+ scripts.append(script_template % (nfa_i, nfa.to_dot()))
+ scripts.append(script_template % (dfa_i, dfa.to_dot()))
+ loads.append(load_template % ("nfa [%s]" % name, nfa_i))
+ loads.append(load_template % ("dfa [%s]" % name, dfa_i))
+ body = "\n".join(scripts) + (load_outer_template % "\n".join(loads))
+ return file_template % body
+
+ def process_rules(self, parser_state):
+ rule_map = {}
+ builder = NfaBuilder()
+ builder.set_character_classes(parser_state.character_classes)
+ assert 'default' in parser_state.rules
+ def process(k, v):
+ assert 'default' in v
+ graphs = []
+ for (graph, action) in v['regex']:
+ (precedence, code, transition) = action
+ if code:
+ graph = NfaBuilder.add_action(graph, (precedence, code, None))
+ if transition == 'continue':
+ if not v['default'][1][2] == 'continue':
+ graph = NfaBuilder.add_continue(graph)
+ else:
+ pass # TODO null key
+ elif (transition == 'break' or
+ transition == 'terminate' or
+ transition == 'terminate_illegal'):
+ graph = NfaBuilder.add_action(graph, (10000, transition, None))
else:
- pass # TODO null key
- elif (transition == 'break' or
- transition == 'terminate' or
- transition == 'terminate_illegal'):
- graph = NfaBuilder.add_action(graph, (10000, transition, None))
- else:
- assert k == 'default'
- graph = NfaBuilder.join_subgraph(graph, transition,
rule_map[transition])
- graphs.append(graph)
- graph = NfaBuilder.or_graphs(graphs)
- # merge default action
- (precedence, code, transition) = v['default'][1]
- assert transition == 'continue' or transition == 'break'
- if transition == 'continue':
- assert k != 'default'
- graph = NfaBuilder.add_incoming_action(graph, (10000, k, None))
- if code:
- graph = NfaBuilder.add_incoming_action(graph, (precedence, code,
None))
- rule_map[k] = graph
- for k, v in parser_state.rules.items():
- if k == 'default': continue
- process(k, v)
- process('default', parser_state.rules['default'])
- html_data = []
- for rule_name, graph in rule_map.items():
- nfa = builder.nfa(graph)
- (start, dfa_nodes) = nfa.compute_dfa()
- dfa = Dfa(start, dfa_nodes)
- html_data.append((rule_name, nfa, dfa))
- return html_data
+ assert k == 'default'
+ graph = NfaBuilder.join_subgraph(graph, transition,
rule_map[transition])
+ graphs.append(graph)
+ graph = NfaBuilder.or_graphs(graphs)
+ # merge default action
+ (precedence, code, transition) = v['default'][1]
+ assert transition == 'continue' or transition == 'break'
+ if transition == 'continue':
+ assert k != 'default'
+ graph = NfaBuilder.add_incoming_action(graph, (10000, k, None))
+ if code:
+ graph = NfaBuilder.add_incoming_action(graph, (precedence, code,
None))
+ rule_map[k] = graph
+ for k, v in parser_state.rules.items():
+ if k == 'default': continue
+ process(k, v)
+ process('default', parser_state.rules['default'])
+ for rule_name, graph in rule_map.items():
+ nfa = builder.nfa(graph)
+ (start, dfa_nodes) = nfa.compute_dfa()
+ dfa = Dfa(start, dfa_nodes)
+ self.__automata[rule_name] = (nfa, dfa)
+
+  # Lexes strings with the help of DFAs produced by the grammar. For sanity
+ # checking the automata.
+ def lex(self, string):
+ (nfa, dfa) = self.__automata['default'] # FIXME
+
+ action_stream = []
+ terminate_seen = False
+ offset = 0
+ while not terminate_seen and string:
+ result = list(dfa.lex(string))
+ last_position = 0
+ for (action, position) in result:
+ action_stream.append((action[1], action[2], last_position +
offset, position + 1 + offset, string[last_position:(position + 1)]))
+ last_position = position
+ if action[2] == 'terminate':
+ terminate_seen = True
+ string = string[(last_position + 1):]
+ offset += last_position
+ return action_stream
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--html')
parser.add_argument('--re', default='src/lexer/lexer_py.re')
+ parser.add_argument('--input')
args = parser.parse_args()
re_file = args.re
parser_state = RuleParserState()
print "parsing %s" % re_file
with open(re_file, 'r') as f:
- RuleParser.parse(f.read(), parser_state)
- html_data = process_rules(parser_state)
+ generator = Generator(f.read())
html_file = args.html
if html_file:
- html = generate_html(html_data)
+ html = generator.generate_html()
with open(args.html, 'w') as f:
f.write(html)
print "wrote html to %s" % html_file
+
+ input_file = args.input
+ if input_file:
+ with open(input_file, 'r') as f:
+ input_text = f.read() + '\0'
+ for t in generator.lex(input_text):
+ print t
=======================================
--- /branches/experimental/parser/tools/lexer_generator/lexer_test.py Fri
Nov 8 10:28:35 2013 UTC
+++ /branches/experimental/parser/tools/lexer_generator/lexer_test.py Fri
Nov 8 12:36:06 2013 UTC
@@ -26,7 +26,7 @@
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import unittest
-from lexer import Lexer
+from generator import Generator
class LexerTestCase(unittest.TestCase):
@@ -38,32 +38,34 @@
def test_simple(self):
rules = '''
<default>
- "(" { LBRACE }
- ")" { RBRACE }
+ "(" { LBRACE } <<continue>>
+ ")" { RBRACE } <<continue>>
- "foo" { FOO }
- eof <<terminate>>'''
+ "foo" { FOO } <<continue>>
+ eof <<terminate>>
+ default <<break>>'''
- lexer = Lexer(rules)
+ generator = Generator(rules)
string = 'foo()\0'
self.__verify_action_stream(
- lexer.lex(string),
+ generator.lex(string),
[('FOO', 'foo'), ('LBRACE', '('), ('RBRACE', ')')])
def test_maximal_matching(self):
rules = '''
<default>
- "<" { LT }
- "<<" { SHL }
- " " { SPACE }
- eof <<terminate>>'''
+ "<" { LT } <<continue>>
+ "<<" { SHL } <<continue>>
+ " " { SPACE } <<continue>>
+ eof <<terminate>>
+ default <<break>>'''
- lexer = Lexer(rules)
+ generator = Generator(rules)
string = '<< <\0'
self.__verify_action_stream(
- lexer.lex(string),
+ generator.lex(string),
[('SHL', '<<'), ('SPACE', ' '), ('LT', '<')])
@@ -75,12 +77,13 @@
digit = [0-9];
number = (digit+ ("." digit+)?);
<default>
- number { NUMBER }
- eof <<terminate>>'''
+ number { NUMBER } <<continue>>
+ eof <<terminate>>
+ default <<break>>'''
- lexer = Lexer(rules)
+ generator = Generator(rules)
string = '555\0'
self.__verify_action_stream(
- lexer.lex(string),
+ generator.lex(string),
[('NUMBER', '555')])
--
--
v8-dev mailing list
[email protected]
http://groups.google.com/group/v8-dev
---
You received this message because you are subscribed to the Google Groups "v8-dev" group.
To unsubscribe from this group and stop receiving emails from it, send an email
to [email protected].
For more options, visit https://groups.google.com/groups/opt_out.