Revision: 18772
Author: [email protected]
Date: Thu Jan 23 11:39:04 2014 UTC
Log: Experimental parser: better eos handling
[email protected]
BUG=
Review URL: https://codereview.chromium.org/144603003
http://code.google.com/p/v8/source/detail?r=18772
Modified:
/branches/experimental/parser/src/lexer/lexer_py.re
/branches/experimental/parser/tools/lexer_generator/code_generator.jinja
/branches/experimental/parser/tools/lexer_generator/code_generator.py
/branches/experimental/parser/tools/lexer_generator/code_generator_test.py
/branches/experimental/parser/tools/lexer_generator/lexer_test.py
/branches/experimental/parser/tools/lexer_generator/nfa_builder.py
/branches/experimental/parser/tools/lexer_generator/rule_lexer.py
/branches/experimental/parser/tools/lexer_generator/rule_parser.py
/branches/experimental/parser/tools/lexer_generator/transition_keys.py
=======================================
--- /branches/experimental/parser/src/lexer/lexer_py.re Thu Jan 23 08:24:55
2014 UTC
+++ /branches/experimental/parser/src/lexer/lexer_py.re Thu Jan 23 11:39:04
2014 UTC
@@ -51,7 +51,6 @@
non_octal_whole_part /(\.[:digit:]*)?/ maybe_exponent );
harmony_number = "0"[bBoO][:digit:]+;
line_terminator_sequence = /[:line_terminator:]|(\r\n|\n\r)/;
-eos = [:eos:];
# Rules.
@@ -197,7 +196,7 @@
identifier_start <|token(IDENTIFIER)|Identifier>
/\\u[:hex_digit:]{4}/ <check_escaped_identifier_start|token(IDENTIFIER)|
Identifier>
-eos <|terminate|>
+eos <terminate>
default_action <do_token_and_go_forward(ILLEGAL)>
<<DoubleQuoteString>>
@@ -211,8 +210,8 @@
"\\" <|token(ILLEGAL)|>
line_terminator <|token(ILLEGAL)|>
"\"" <|token(STRING)|>
-eos <|terminate_illegal|>
catch_all <||continue>
+eos <terminate_illegal>
<<SingleQuoteString>>
# TODO subgraph for '\'
@@ -226,8 +225,8 @@
"\\" <|token(ILLEGAL)|>
line_terminator <|token(ILLEGAL)|>
"'" <|token(STRING)|>
-eos <|terminate_illegal|>
catch_all <||continue>
+eos <terminate_illegal>
<<Identifier>>
identifier_char <|token(IDENTIFIER)|continue>
@@ -235,7 +234,7 @@
<<SingleLineComment>>
line_terminator <|line_terminator|>
-eos <|skip_and_terminate|>
+eos <skip_and_terminate>
catch_all <||continue>
<<MultiLineComment>>
@@ -243,5 +242,5 @@
# TODO find a way to generate the below rule
/\*+[^\/*]/ <||continue>
line_terminator <line_terminator_in_multiline_comment||continue>
-eos <|terminate_illegal|>
catch_all <||continue>
+eos <terminate_illegal>
=======================================
---
/branches/experimental/parser/tools/lexer_generator/code_generator.jinja
Thu Jan 23 10:16:42 2014 UTC
+++
/branches/experimental/parser/tools/lexer_generator/code_generator.jinja
Thu Jan 23 11:39:04 2014 UTC
@@ -237,14 +237,22 @@
{% endif -%}
{%- macro do_transition(jump_id) -%}
- {%- set transition_state_id = jump_table[jump_id][0] -%}
- {%- set inline_transition = jump_table[jump_id][1] == 'inline' %}
FORWARD();
- {%- if inline_transition %}
- {{ do_dfa_state(transition_state_id) }}
- {% else %}
- {{ jump(jump_id) }}
- {% endif %}
+ if (cursor_ < buffer_end_) {
+ {%- set transition_state_id = jump_table[jump_id][0] -%}
+ {%- set inline_transition = jump_table[jump_id][1] == 'inline' %}
+ {%- if inline_transition %}
+ {{ do_dfa_state(transition_state_id) }}
+ {% else %}
+ {{ jump(jump_id) }}
+ {% endif %}
+ }
+ {% if 'eos' in state['unique_transitions'] -%}
+ {{ jump(state['unique_transitions']['eos']) }} // eos handler
+ {%- else -%}
+ BACKWARD(1);
+ {{ jump(state['jump_before_match']) }} // no eos handler
+ {%- endif %}
{%- endmacro -%}
{%- if state['switch_transitions'] -%}
@@ -283,9 +291,10 @@
{% endfor -%}
}
{%- endif-%}
+
+ {{ write_label('before_match', node_number) }}
{%- set match_action = state.match_action -%}
-
{%- if match_action %}
{{ dispatch_match_action(match_action[0], match_action[1]) }}
CRASH();
@@ -317,8 +326,8 @@
}
#define READ_CURSOR() { \
- if (cursor_ >= buffer_end_) primary_char = 0; \
- else primary_char = *(cursor_); \
+ ASSERT(cursor_ < buffer_end_); \
+ primary_char = *(cursor_); \
}
#ifdef DEBUG
=======================================
--- /branches/experimental/parser/tools/lexer_generator/code_generator.py
Thu Jan 23 10:16:42 2014 UTC
+++ /branches/experimental/parser/tools/lexer_generator/code_generator.py
Thu Jan 23 11:39:04 2014 UTC
@@ -54,7 +54,7 @@
self.__switching = switching
self.__jump_table = []
- __jump_labels = ['state_entry', 'after_entry_code']
+ __jump_labels = ['state_entry', 'after_entry_code', 'before_match']
@staticmethod
def __transform_state(encoding, state):
@@ -92,7 +92,6 @@
raise Exception()
if keys:
transitions.append((keys, transition_id))
- # eos_transitions is for a followup cl
return {
'node_number' : None,
'original_node_number' : state.node_number(),
@@ -129,9 +128,9 @@
def __set_inline(self, count, state):
assert state['inline'] == None
inline = False
- # inline terminal states
if state['must_not_inline']:
inline = False
+ # inline terminal states
elif not state['transitions']:
inline = True
# inline next to terminal states with 1 or 2 transitions
@@ -193,7 +192,7 @@
bom = 'byte_order_mark'
catch_all = 'non_primary_everything_else'
all_classes = set(encoding.class_name_iter())
- fast_classes = set(['eos', 'zero'])
+ fast_classes = set([])
call_classes = all_classes - fast_classes - set([bom, catch_all])
def remap_transition(class_name):
if class_name in call_classes:
@@ -258,6 +257,17 @@
for node_number in current_node['unique_transitions'].values():
CodeGenerator.__reorder(node_number, id_map, dfa_states)
+ @staticmethod
+ def __mark_eos_states(dfa_states, eos_states):
+ for state_id in eos_states:
+ state = dfa_states[state_id]
+ state['is_eos_handler'] = True
+ state['must_not_inline'] = True
+ assert state['match_action']
+ assert not state['entry_action']
+ assert not state['transitions']
+ assert not state['unique_transitions']
+
def __build_dfa_states(self):
dfa_states = []
self.__dfa.visit_all_states(lambda state, acc:
dfa_states.append(state))
@@ -267,13 +277,20 @@
id_map = {x['original_node_number'] : x for x in dfa_states}
dfa_states = []
CodeGenerator.__reorder(self.__start_node_number, id_map, dfa_states)
+ # store states
+ eos_states = set([])
def f((key, original_node_number)):
return (key, id_map[original_node_number]['node_number'])
for state in dfa_states:
state['transitions'] = map(f, state['transitions'])
+ state['unique_transitions'] = {k : id_map[v]['node_number']
+ for k, v in state['unique_transitions'].items()}
+ if 'eos' in state['unique_transitions']:
+ eos_states.add(state['unique_transitions']['eos'])
assert id_map[self.__start_node_number]['node_number'] == 0
assert len(dfa_states) == self.__dfa.node_count()
- # store states
+ # mark eos states
+ self.__mark_eos_states(dfa_states, eos_states)
self.__dfa_states = dfa_states
def __rewrite_gotos(self):
@@ -311,8 +328,6 @@
self.__dfa_states.append(state)
# mark inline as none, to be correctly handled below
state['inline'] = None
- # clear transitions, they shouldn't be used anymore
- state['transitions'] = []
# clear entry points
state['entry_points'] = {k : False for k in
CodeGenerator.__jump_labels}
return state['node_number']
@@ -350,6 +365,14 @@
return (key, self.__register_jump(target_id, jump_type))
for name in transition_names:
state[name] = map(generate_jump, state[name])
+ if 'eos' in state['unique_transitions']:
+ # eos state is not inlined
+ eos_state_id = state['unique_transitions']['eos']
+ jump = self.__register_jump(eos_state_id, 'state_entry')
+ state['unique_transitions']['eos'] = jump
+ elif state['transitions']:
+ jump = self.__register_jump(state_id, 'before_match')
+ state['jump_before_match'] = jump
# now rewrite all nodes created
nodes_created = len(inline_mapping) - len(inline_mapping_in)
assert len(self.__dfa_states) == (
=======================================
---
/branches/experimental/parser/tools/lexer_generator/code_generator_test.py
Tue Jan 14 12:22:39 2014 UTC
+++
/branches/experimental/parser/tools/lexer_generator/code_generator_test.py
Thu Jan 23 11:39:04 2014 UTC
@@ -33,12 +33,11 @@
def test_simple(self):
rules = '''
- eos = [:eos:];
<<default>>
"(" <|LBRACE|>
")" <|RBRACE|>
"foo" <|FOO|>
- eos <|terminate|>
+ eos <terminate>
default_action <DEFAULT>'''
CodeGenerator(RuleProcessor.parse(rules, 'latin1'))
=======================================
--- /branches/experimental/parser/tools/lexer_generator/lexer_test.py Tue
Jan 21 15:59:38 2014 UTC
+++ /branches/experimental/parser/tools/lexer_generator/lexer_test.py Thu
Jan 23 11:39:04 2014 UTC
@@ -48,13 +48,12 @@
def test_simple(self):
rules = '''
- eos = [:eos:];
<<default>>
"(" <|LBRACE|>
")" <|RBRACE|>
"foo" <|FOO|>
- eos <|terminate|>'''
+ eos <terminate>'''
string = 'foo()'
self.__verify_action_stream(rules, string,
@@ -62,12 +61,11 @@
def test_maximal_matching(self):
rules = '''
- eos = [:eos:];
<<default>>
"<" <|LT|>
"<<" <|SHL|>
" " <|SPACE|>
- eos <|terminate|>'''
+ eos <terminate>'''
string = '<< <'
self.__verify_action_stream(rules, string,
@@ -75,7 +73,6 @@
def test_consecutive_epsilon_transitions(self):
rules = '''
- eos = [:eos:];
digit = [0-9];
number = (digit+ ("." digit+)?);
<<default>>
=======================================
--- /branches/experimental/parser/tools/lexer_generator/nfa_builder.py Thu
Jan 23 09:11:01 2014 UTC
+++ /branches/experimental/parser/tools/lexer_generator/nfa_builder.py Thu
Jan 23 11:39:04 2014 UTC
@@ -251,6 +251,7 @@
keys = reduce(f, reachable_states, set())
keys.discard(TransitionKey.epsilon())
keys.discard(catch_all)
+ keys.discard(TransitionKey.unique('eos'))
inverse_key = TransitionKey.inverse_key(encoding, keys)
if inverse_key:
transitions[inverse_key] = transitions[catch_all]
=======================================
--- /branches/experimental/parser/tools/lexer_generator/rule_lexer.py Tue
Jan 14 12:22:39 2014 UTC
+++ /branches/experimental/parser/tools/lexer_generator/rule_lexer.py Thu
Jan 23 11:39:04 2014 UTC
@@ -31,6 +31,7 @@
tokens = (
'DEFAULT_ACTION',
+ 'EOS',
'CATCH_ALL',
'IDENTIFIER',
@@ -61,7 +62,7 @@
pass
__special_identifiers = set(map(lambda s: s.lower(),
- ['DEFAULT_ACTION', 'CATCH_ALL']))
+ ['DEFAULT_ACTION', 'CATCH_ALL', 'EOS']))
def t_IDENTIFIER(self, t):
r'[a-zA-Z0-9_]+'
=======================================
--- /branches/experimental/parser/tools/lexer_generator/rule_parser.py Wed
Jan 22 09:26:03 2014 UTC
+++ /branches/experimental/parser/tools/lexer_generator/rule_parser.py Thu
Jan 23 11:39:04 2014 UTC
@@ -99,6 +99,7 @@
def p_transition_rule(self, p):
'''transition_rule : composite_regex action
| DEFAULT_ACTION default_action
+ | EOS eos
| CATCH_ALL action'''
precedence = RuleParser.__rule_precedence_counter
RuleParser.__rule_precedence_counter += 1
@@ -112,7 +113,7 @@
assert self.__state.current_state == 'default'
assert not rules['default_action']
rules['default_action'] = action
- elif p[1] == 'catch_all':
+ elif p[1] == 'eos' or p[1] == 'catch_all':
assert p[1] not in rules['uniques']
rules['uniques'][p[1]] = True
rules['regex'].append((NfaBuilder.unique_key(p[1]), precedence,
action))
@@ -128,6 +129,10 @@
'default_action : ACTION_OPEN identifier_action ACTION_CLOSE'
p[0] = (None, p[2], None)
+ def p_eos(self, p):
+ 'eos : ACTION_OPEN identifier_action ACTION_CLOSE'
+ p[0] = (None, p[2], None)
+
def p_maybe_identifier_action(self, p):
'''maybe_identifier_action : identifier_action
| empty'''
=======================================
--- /branches/experimental/parser/tools/lexer_generator/transition_keys.py
Wed Jan 22 09:26:03 2014 UTC
+++ /branches/experimental/parser/tools/lexer_generator/transition_keys.py
Thu Jan 23 11:39:04 2014 UTC
@@ -475,8 +475,8 @@
def __init__(self):
super(Latin1Encoding, self).__init__(
'latin1',
- (1, 255),
- ['eos', 'zero'])
+ (0, 255),
+ [])
self.add_predefined_range(
'whitespace', [(9, 9), (11, 12), (32, 32), (133, 133), (160, 160)])
self.add_predefined_range(
@@ -492,8 +492,8 @@
def __init__(self):
super(Utf16Encoding, self).__init__(
'utf16',
- (1, 255),
- ['eos', 'zero', 'byte_order_mark',
+ (0, 255),
+ ['byte_order_mark',
'non_primary_whitespace',
'non_primary_letter',
'non_primary_identifier_part_not_letter',
@@ -522,8 +522,8 @@
def __init__(self):
super(Utf8Encoding, self).__init__(
'utf8',
- (1, 127),
- ['eos', 'zero', 'byte_order_mark',
+ (0, 127),
+ ['byte_order_mark',
'non_primary_whitespace',
'non_primary_letter',
'non_primary_identifier_part_not_letter',
--
--
v8-dev mailing list
[email protected]
http://groups.google.com/group/v8-dev
---
You received this message because you are subscribed to the Google Groups "v8-dev" group.
To unsubscribe from this group and stop receiving emails from it, send an email
to [email protected].
For more options, visit https://groups.google.com/groups/opt_out.