Revision: 17628
Author: [email protected]
Date: Tue Nov 12 07:12:31 2013 UTC
Log: Experimental parser: add catch all rule
[email protected]
BUG=
Review URL: https://codereview.chromium.org/69293005
http://code.google.com/p/v8/source/detail?r=17628
Modified:
/branches/experimental/parser/src/lexer/lexer_py.re
/branches/experimental/parser/tools/lexer_generator/automata_test.py
/branches/experimental/parser/tools/lexer_generator/dfa.py
/branches/experimental/parser/tools/lexer_generator/generator.py
/branches/experimental/parser/tools/lexer_generator/nfa.py
/branches/experimental/parser/tools/lexer_generator/regex_lexer.py
/branches/experimental/parser/tools/lexer_generator/rule_lexer.py
/branches/experimental/parser/tools/lexer_generator/rule_parser.py
/branches/experimental/parser/tools/lexer_generator/rule_parser_test.py
/branches/experimental/parser/tools/lexer_generator/transition_keys.py
=======================================
--- /branches/experimental/parser/src/lexer/lexer_py.re Mon Nov 11 14:46:29 2013 UTC
+++ /branches/experimental/parser/src/lexer/lexer_py.re Tue Nov 12 07:12:31 2013 UTC
@@ -163,24 +163,22 @@
default_action { PUSH_TOKEN(ILLEGAL); }
<DoubleQuoteString>
-"\\" <<continue>>
-"\\\"" <<continue>>
-"\"" { PUSH_TOKEN(STRING); }
/\\\n\r?/ <<continue>>
/\\\r\n?/ <<continue>>
-/\n|\r/ { PUSH_TOKEN(ILLEGAL); }
+/\\./ <<continue>>
+/\n|\r/ { PUSH_TOKEN(ILLEGAL); }
+"\"" { PUSH_TOKEN(STRING); }
eof <<terminate_illegal>>
-/./ <<continue>>
+catch_all <<continue>>
<SingleQuoteString>
-"\\" <<continue>>
-"\\'" <<continue>>
-"'" { PUSH_TOKEN(STRING); }
/\\\n\r?/ <<continue>>
/\\\r\n?/ <<continue>>
-/\n|\r/ { PUSH_TOKEN(ILLEGAL); }
+/\\./ <<continue>>
+/\n|\r/ { PUSH_TOKEN(ILLEGAL); }
+"'" { PUSH_TOKEN(STRING); }
eof <<terminate_illegal>>
-/./ <<continue>>
+catch_all <<continue>>
<Identifier>
identifier_char <<continue>>
@@ -194,18 +192,21 @@
<SingleLineComment>
line_terminator { PUSH_LINE_TERMINATOR(); }
eof <<terminate>>
-/./ <<continue>>
+catch_all <<continue>>
<MultiLineComment>
"*/" { SKIP(); }
+/\*./ <<continue>>
# need to force action
line_terminator+ { PUSH_LINE_TERMINATOR(); } <<continue>>
eof <<terminate>>
-/./ <<continue>>
+catch_all <<continue>>
<HtmlComment>
"-->" { SKIP(); }
+/--./ <<continue>>
+/-./ <<continue>>
# need to force action
line_terminator+ { PUSH_LINE_TERMINATOR(); } <<continue>>
eof <<terminate>>
-/./ <<continue>>
+catch_all <<continue>>
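
For concreteness, here is a hand-written Python rendering of the rewritten
<DoubleQuoteString> state above. The function name and return convention are
invented for illustration; only the rule order and the actions come from the
.re file, and the backslash case is simplified (the real rules also
distinguish the \-newline continuation pairs from /\\./):

    def scan_double_quote_string(src, i):
        # i points just past the opening quote; returns (token, next_index).
        while i < len(src):
            c = src[i]
            if c == '\\' and i + 1 < len(src):
                i += 2                       # /\\./ and \-newline: <<continue>>
            elif c in '\n\r':
                return ('ILLEGAL', i)        # /\n|\r/ { PUSH_TOKEN(ILLEGAL); }
            elif c == '"':
                return ('STRING', i + 1)     # "\"" { PUSH_TOKEN(STRING); }
            else:
                i += 1                       # catch_all <<continue>>
        return ('ILLEGAL', i)                # eof <<terminate_illegal>>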
=======================================
--- /branches/experimental/parser/tools/lexer_generator/automata_test.py Thu Nov 7 12:55:33 2013 UTC
+++ /branches/experimental/parser/tools/lexer_generator/automata_test.py Tue Nov 12 07:12:31 2013 UTC
@@ -104,13 +104,9 @@
verify_hit("leftrightleftright", [l, r, l, r])
verify_miss("leftrightleftrightx", [l, r, l, r])
- def test_continue(self):
- graph = NfaBuilder.cat_graphs([
- NfaBuilder.cat_graphs([
- RegexParser.parse("ab"),
- NfaBuilder.add_continue(RegexParser.parse("cd"))]),
- RegexParser.parse("ef")])
- nfa = NfaBuilder().nfa(graph)
- self.assertFalse(nfa.matches(""))
- self.assertTrue(nfa.matches("abcdef"))
- self.assertTrue(nfa.matches("abcdabcdef"))
+ def test_minimization(self):
+ for (regex, matches, not_matches) in self.__test_data:
+ (nfa, dfa) = build_automata(regex)
+ dfa.minimize()
+
+
=======================================
--- /branches/experimental/parser/tools/lexer_generator/dfa.py Mon Nov 11 14:46:29 2013 UTC
+++ /branches/experimental/parser/tools/lexer_generator/dfa.py Tue Nov 12 07:12:31 2013 UTC
@@ -156,6 +156,9 @@
state_iterator = lambda x : [x]
return self.generate_dot(self.__start, self.__terminal_set, iterator,
state_iterator)
+ def minimize(self):
+ pass
+
def to_code(self):
code = '''
char c = *cursor;
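
minimize() lands here as an empty stub. One standard algorithm a later
revision could drop in is Moore-style partition refinement; the sketch below
is self-contained, and its dict-based DFA layout (transitions[state][symbol]
-> state, total over the alphabet) is an assumption, since the diff does not
show the Dfa internals:

    def minimize(transitions, accepting, alphabet):
        # Refine {accepting, non-accepting} until every block is closed
        # under the transition signature; states sharing a final label
        # accept the same language and can be merged.
        partition = {s: (0 if s in accepting else 1) for s in transitions}
        while True:
            sig = {s: (partition[s],
                       tuple(partition[transitions[s][a]] for a in alphabet))
                   for s in transitions}
            labels = {v: i for i, v in enumerate(sorted(set(sig.values())))}
            refined = {s: labels[sig[s]] for s in transitions}
            if len(set(refined.values())) == len(set(partition.values())):
                return refined
            partition = refined

    # B and C accept the same language, so they end up sharing a label:
    t = {'A': {'x': 'B'}, 'B': {'x': 'C'}, 'C': {'x': 'C'}}
    print(minimize(t, accepting={'B', 'C'}, alphabet=['x']))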
=======================================
--- /branches/experimental/parser/tools/lexer_generator/generator.py Mon Nov 11 14:22:00 2013 UTC
+++ /branches/experimental/parser/tools/lexer_generator/generator.py Tue Nov 12 07:12:31 2013 UTC
@@ -91,19 +91,18 @@
assert 'default' in parser_state.rules
def process(k, v):
graphs = []
+ continues = 0
for (graph, (precedence, code, transition)) in v['regex']:
default_code = v['default_action']
action = code if code else default_code
if action:
graph = NfaBuilder.add_action(graph, (precedence, action))
- if not transition:
+ if not transition or transition == 'break':
pass
elif transition == 'continue':
assert not k == 'default'
+ continues += 1
graph = NfaBuilder.add_continue(graph)
- elif transition == 'break':
- assert code
- graph = NfaBuilder.add_break(graph)
elif (transition == 'terminate' or
transition == 'terminate_illegal'):
assert not code
@@ -114,6 +113,11 @@
graph = NfaBuilder.join_subgraph(
graph, transition, rule_map[transition], subgraph_modifier)
graphs.append(graph)
+ if continues == len(graphs):
+ graphs.append(NfaBuilder.epsilon())
+ if v['catch_all']:
+ assert v['catch_all'] == 'continue'
+ graphs.append(NfaBuilder.add_continue(NfaBuilder.catch_all()))
graph = NfaBuilder.or_graphs(graphs)
rule_map[k] = graph
# process first the subgraphs, then the default graph
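
Two behavioural notes on this hunk: 'break' now degenerates to the
no-transition case instead of building a BREAK node, and a state whose
alternatives all <<continue>> gains an extra epsilon alternative so the
or-graph keeps a non-looping exit. Restated on the tuple-tagged graph DSL
visible in nfa.py below (this mirror of the NfaBuilder calls, including the
'OR' tag, is illustrative only):

    def build_state_graph(rule_graphs, transitions, catch_all):
        # rule_graphs and transitions are aligned per rule; transitions
        # holds the keyword from <<...>>, or None.
        graphs, continues = [], 0
        for g, tr in zip(rule_graphs, transitions):
            if tr == 'continue':
                continues += 1
                g = ('CONTINUE', g)
            graphs.append(g)
        if continues == len(graphs):
            graphs.append(('EPSILON',))            # non-looping exit
        if catch_all == 'continue':
            graphs.append(('CONTINUE', ('CATCH_ALL',)))
        return ('OR', graphs)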
=======================================
--- /branches/experimental/parser/tools/lexer_generator/nfa.py Mon Nov 11 12:28:48 2013 UTC
+++ /branches/experimental/parser/tools/lexer_generator/nfa.py Tue Nov 12 07:12:31 2013 UTC
@@ -106,10 +106,27 @@
def key_matches(self, value):
return self.__matches(lambda k, v : k.matches_key(v), value)
+ def replace_catch_all(self):
+ catch_all = TransitionKey.unique('catch_all')
+ if not catch_all in self.__transitions:
+ return
+ f = lambda acc, state: acc | state.__epsilon_closure
+ reachable_states = reduce(f, self.__transitions[catch_all], set())
+ f = lambda acc, state: acc | set(state.__transitions.keys())
+ keys = reduce(f, reachable_states, set())
+ keys.discard(TransitionKey.epsilon())
+ keys.discard(catch_all)
+ inverse_key = TransitionKey.inverse_key(keys)
+ if inverse_key:
+ self.__transitions[inverse_key] = self.__transitions[catch_all]
+ del self.__transitions[catch_all]
+
@staticmethod
def gather_transition_keys(state_set):
f = lambda acc, state: acc | set(state.__transitions.keys())
- return TransitionKey.disjoint_keys(reduce(f, state_set, set()))
+ keys = reduce(f, state_set, set())
+ keys.discard(TransitionKey.epsilon())
+ return TransitionKey.disjoint_keys(keys)
class NfaBuilder:
@@ -202,6 +219,12 @@
def __any(self, graph):
return self.__key_state(TransitionKey.any())
+ def __epsilon(self, graph):
+ start = self.__new_state()
+ end = self.__new_state()
+ start.close(end)
+ return (start, [end])
+
def __action(self, graph):
(start, ends) = self.__process(graph[1])
action = graph[2]
@@ -215,19 +238,11 @@
state = self.__peek_state()
if not state['start_node']:
state['start_node'] = self.__new_state()
- end = self.__new_state()
- self.__patch_ends(ends, end)
- ends = [end]
- end.add_epsilon_transition(state['start_node'])
- return (start, ends)
+ self.__patch_ends(ends, state['start_node'])
+ return (start, [])
- def __break(self, graph):
- (start, ends) = self.__process(graph[1])
- self.__peek_state()['unpatched_ends'] += ends
- new_end = self.__new_state()
- for end in ends:
- end.add_epsilon_transition(new_end)
- return (start, [new_end])
+ def __catch_all(self, graph):
+ return self.__key_state(TransitionKey.unique('catch_all'))
def __join(self, graph):
(graph, name, subgraph, modifier) = graph[1:]
@@ -302,9 +317,13 @@
def add_continue(graph):
return ('CONTINUE', graph)
+ @staticmethod
+ def catch_all():
+ return ('CATCH_ALL',)
+
@staticmethod
- def add_break(graph):
- return ('BREAK', graph)
+ def epsilon():
+ return ('EPSILON',)
@staticmethod
def join_subgraph(graph, name, subgraph, modifier):
@@ -413,6 +432,7 @@
def compute_dfa(self):
self.__compute_epsilon_closures()
+ self.__visit_all_edges(lambda node, state: node.replace_catch_all(), None)
dfa_nodes = {}
start_name = self.__to_dfa(set([self.__start]), dfa_nodes, self.__end)
return (start_name, dfa_nodes)
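
replace_catch_all is what gives the sentinel its meaning: once epsilon
closures are computed, the catch_all edge is rewritten to the complement of
every key leaving the states its targets can epsilon-reach (in a
<<continue>> loop those are the other rules' first characters), or dropped
if nothing remains. The same idea on plain frozensets of characters; the
Node layout below is an assumption, the real code works on TransitionKey
range sets:

    class Node(object):
        def __init__(self):
            self.transitions = {}  # key -> set of Node; keys are frozensets
                                   # of chars or the 'catch_all' sentinel
            self.closure = {self}  # epsilon closure, precomputed elsewhere

    def replace_catch_all(node, alphabet):
        trans = node.transitions
        if 'catch_all' not in trans:
            return
        reachable = set().union(*(t.closure for t in trans['catch_all']))
        keys = set().union(*(n.transitions.keys() for n in reachable))
        keys.discard('catch_all')
        covered = frozenset().union(*keys)
        targets = trans.pop('catch_all')
        inverse = frozenset(alphabet) - covered
        if inverse:                      # drop the edge if nothing is left
            trans[inverse] = targets

    # catch_all loops back to the state start, so the replacement key is
    # "anything that does not start another rule":
    a, b = Node(), Node()
    a.transitions = {frozenset('"\\\n\r'): {b}, 'catch_all': {a}}
    replace_catch_all(a, ''.join(map(chr, range(128))))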
=======================================
--- /branches/experimental/parser/tools/lexer_generator/regex_lexer.py Fri Nov 8 10:33:44 2013 UTC
+++ /branches/experimental/parser/tools/lexer_generator/regex_lexer.py Tue Nov 12 07:12:31 2013 UTC
@@ -107,7 +107,7 @@
r'\\\d+'
return t
- __escaped_class_literals = build_escape_map("^[]-:")
+ __escaped_class_literals = build_escape_map("^[]-:\\")
def t_class_ESCAPED_CLASS_LITERAL(self, t):
r'\\.'
@@ -115,7 +115,7 @@
t.value = RegexLexer.__escaped_class_literals[t.value]
return t
- t_class_CLASS_LITERAL = r'[\w $_+]'
+ t_class_CLASS_LITERAL = r'[\w $_+\']'
def t_REPEAT_BEGIN(self, t):
r'\{'
=======================================
--- /branches/experimental/parser/tools/lexer_generator/rule_lexer.py Mon Nov 11 07:52:15 2013 UTC
+++ /branches/experimental/parser/tools/lexer_generator/rule_lexer.py Tue Nov 12 07:12:31 2013 UTC
@@ -32,6 +32,8 @@
tokens = (
'DEFAULT',
'DEFAULT_ACTION',
+ 'CATCH_ALL',
+
'IDENTIFIER',
'STRING',
'REGEX',
@@ -67,12 +69,13 @@
r'\#.*[\n\r]+'
pass
+ __special_identifiers = set(map(lambda s: s.lower(),
+ ['DEFAULT', 'DEFAULT_ACTION', 'CATCH_ALL',]))
+
def t_IDENTIFIER(self, t):
r'[a-zA-Z][a-zA-Z0-9_]*'
- if t.value == 'default':
- t.type = 'DEFAULT'
- if t.value == 'default_action':
- t.type = 'DEFAULT_ACTION'
+ if t.value in self.__special_identifiers:
+ t.type = t.value.upper()
return t
t_STRING = r'"((\\("|\w|\\))|[^\\"])+"'
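
In isolation, the new table-driven branch classifies identifier spellings
like this (PLY supplies the real token objects; this strips that away):

    special = {'default', 'default_action', 'catch_all'}

    def classify(value):
        # spellings in the table are re-typed to their keyword token
        return value.upper() if value in special else 'IDENTIFIER'

    assert classify('catch_all') == 'CATCH_ALL'
    assert classify('catchall') == 'IDENTIFIER'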
=======================================
--- /branches/experimental/parser/tools/lexer_generator/rule_parser.py Mon Nov 11 07:52:15 2013 UTC
+++ /branches/experimental/parser/tools/lexer_generator/rule_parser.py Tue Nov 12 07:12:31 2013 UTC
@@ -86,6 +86,7 @@
if not state.current_state in state.rules:
state.rules[state.current_state] = {
'default_action': None,
+ 'catch_all' : None,
'regex' : []
}
p[0] = state.current_state
@@ -98,9 +99,10 @@
'''transition_rule : composite_regex code action
| composite_regex empty action
| composite_regex code empty
- | DEFAULT_ACTION code empty'''
- transition = p[3] if p[3] else 'break'
- if not transition in self.__keyword_transitions:
+ | DEFAULT_ACTION code empty
+ | CATCH_ALL empty action'''
+ transition = p[3]
+ if transition and not transition in self.__keyword_transitions:
assert not transition == 'default'
self.__state.transitions.add(transition)
RuleParser.__rule_precedence_counter += 1
@@ -109,6 +111,9 @@
if p[1] == 'default_action':
assert not rules['default_action']
rules['default_action'] = code
+ elif p[1] == 'catch_all':
+ assert not rules['catch_all']
+ rules['catch_all'] = transition
else:
rule = (p[1], (RuleParser.__rule_precedence_counter, code,
transition))
rules['regex'].append(rule)
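
With the new 'CATCH_ALL empty action' production, a catch_all rule takes a
transition keyword but no code block, and is stored once per state under
rules['catch_all'] rather than in the regex list. In the rule file that
shape looks like this (taken from the lexer_py.re hunk above):

    <SingleLineComment>
    line_terminator { PUSH_LINE_TERMINATOR(); }
    eof <<terminate>>
    catch_all <<continue>>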
=======================================
--- /branches/experimental/parser/tools/lexer_generator/rule_parser_test.py Thu Nov 7 12:03:35 2013 UTC
+++ /branches/experimental/parser/tools/lexer_generator/rule_parser_test.py Tue Nov 12 07:12:31 2013 UTC
@@ -53,19 +53,19 @@
self.assertTrue(len(self.state.rules), 2)
self.assertTrue('cond1' in self.state.rules)
- self.assertEquals(len(self.state.rules['cond1']), 2)
+ self.assertEquals(len(self.state.rules['cond1']['regex']), 2)
# self.assertTrue('regex2' in self.state.rules['cond1'])
# self.assertEquals(self.state.rules['cond1']['regex2'],
# ('condition', 'cond2'))
self.assertTrue('cond2' in self.state.rules)
- self.assertEquals(len(self.state.rules['cond2']), 2)
+ self.assertEquals(len(self.state.rules['cond2']['regex']), 2)
# self.assertTrue('regex3' in self.state.rules['cond2'])
# self.assertEquals(self.state.rules['cond2']['regex3'],
# ('body', 'body'))
self.assertTrue('cond3' in self.state.rules)
- self.assertEquals(len(self.state.rules['cond3']), 2)
+ self.assertEquals(len(self.state.rules['cond3']['regex']), 2)
# self.assertTrue('regex4' in self.state.rules['cond3'])
# self.assertEquals(self.state.rules['cond3']['regex4'],
# ('condition_and_body', 'cond4', 'body'))
@@ -79,7 +79,7 @@
self.assertEquals(self.state.aliases['alias'],
RegexParser.parse("regex;with;semicolon"))
self.assertTrue('cond1' in self.state.rules)
- self.assertEquals(len(self.state.rules['cond1']), 2)
+ self.assertEquals(len(self.state.rules['cond1']['regex']), 2)
# self.assertEquals(
# self.parse['cond1']['regex4{with{braces}'],
=======================================
--- /branches/experimental/parser/tools/lexer_generator/transition_keys.py Mon Nov 11 14:22:00 2013 UTC
+++ /branches/experimental/parser/tools/lexer_generator/transition_keys.py Tue Nov 12 07:12:31 2013 UTC
@@ -41,6 +41,12 @@
__unique_key_counter = -1
+ @staticmethod
+ def alphabet_iter():
+ for k, (lower, upper) in TransitionKey.__class_bounds.items():
+ for i in range(lower, upper + 1):
+ yield i
+
@staticmethod
def __in_latin_1(char):
bound = TransitionKey.__class_bounds["latin_1"]
@@ -86,6 +92,9 @@
key.__cached_hash = None
return key
+ def __is_unique(self):
+ return len(self.__ranges) == 1 and self.__is_unique_range(self.__ranges[0])
+
@staticmethod
def __cached_key(name, bounds_getter):
if not name in TransitionKey.__cached_keys:
@@ -109,7 +118,7 @@
return TransitionKey.__create([(char, char)])
@staticmethod
- def unique_key(name):
+ def unique(name):
def get_bounds(name):
bound = TransitionKey.__unique_key_counter
TransitionKey.__unique_key_counter -= 1
@@ -242,20 +251,34 @@
return ranges
@staticmethod
- def disjoint_keys(key_set):
- key_set.discard(TransitionKey.epsilon())
+ def __disjoint_ranges_from_key_set(key_set):
if not key_set:
return []
range_map = {}
for x in key_set:
+ assert not x.__is_unique()
+ assert x != TransitionKey.epsilon()
for r in x.__ranges:
if not r[0] in range_map:
range_map[r[0]] = []
range_map[r[0]].append(r[1])
ranges = TransitionKey.__disjoint_keys(range_map)
TransitionKey.__verify_ranges(ranges, False)
+ return ranges
+
+ @staticmethod
+ def disjoint_keys(key_set):
+ ranges = TransitionKey.__disjoint_ranges_from_key_set(key_set)
return map(lambda x : TransitionKey.__create([x]), ranges)
+ @staticmethod
+ def inverse_key(key_set):
+ ranges = TransitionKey.__disjoint_ranges_from_key_set(key_set)
+ inverse = TransitionKey.__invert_ranges(ranges)
+ if not inverse:
+ return None
+ return TransitionKey.__create(inverse)
+
@staticmethod
def __key_from_ranges(invert, ranges):
range_map = {}
@@ -296,7 +319,8 @@
inverted = []
last = None
classes = set(TransitionKey.__class_bounds.values())
- classes.remove(TransitionKey.__class_bounds["latin_1"])
+ latin_1 = TransitionKey.__class_bounds["latin_1"]
+ classes.remove(latin_1)
for r in ranges:
assert not TransitionKey.__is_unique_range(r)
if TransitionKey.__is_class_range(r):
@@ -308,8 +332,10 @@
elif last[1] + 1 < r[0]:
inverted.append((last[1] + 1, r[0] - 1))
last = r
- upper_bound = TransitionKey.__class_bounds["latin_1"][1]
- if last != None and last[1] < upper_bound:
+ upper_bound = latin_1[1]
+ if last == None:
+ inverted.append(latin_1)
+ elif last[1] < upper_bound:
inverted.append((last[1] + 1, upper_bound))
inverted += list(classes)
return inverted