Revision: 18014
Author: [email protected]
Date: Fri Nov 22 13:16:25 2013 UTC
Log: Experimental parser: clean up code requiring calls to the unicode cache
[email protected]
BUG=
Review URL: https://codereview.chromium.org/83243004
http://code.google.com/p/v8/source/detail?r=18014
Modified:
/branches/experimental/parser/src/lexer/lexer_py.re
/branches/experimental/parser/tools/lexer_generator/code_generator.jinja
/branches/experimental/parser/tools/lexer_generator/code_generator.py
/branches/experimental/parser/tools/lexer_generator/transition_keys.py
=======================================
--- /branches/experimental/parser/src/lexer/lexer_py.re Fri Nov 22 08:51:22 2013 UTC
+++ /branches/experimental/parser/src/lexer/lexer_py.re Fri Nov 22 13:16:25 2013 UTC
@@ -68,21 +68,19 @@
"<!--" <||SingleLineComment>
"<!-" <|{
- cursor_ -= 2;
- yych = *(cursor_);
+ BACKWARD(2);
PUSH_TOKEN(Token::LT);
}|>
"<!" <|{
- cursor_ -= 1;
- yych = *(cursor_);
+ BACKWARD(1);
PUSH_TOKEN(Token::LT);
}|>
"-->" <{
if (!just_seen_line_terminator_) {
- yych = *(--cursor_);
+ BACKWARD(1);
PUSH_TOKEN(Token::DEC);
}
}||SingleLineComment>
@@ -134,7 +132,7 @@
"," <|push_token(COMMA)|>
line_terminator+ <|push_line_terminator|>
-/[:whitespace::byte_order_mark:]+/ <|skip|>
+/[:whitespace:]+/ <|skip|>
"\"" <set_marker(1)||DoubleQuoteString>
"'" <set_marker(1)||SingleQuoteString>
=======================================
--- /branches/experimental/parser/tools/lexer_generator/code_generator.jinja Fri Nov 22 12:59:16 2013 UTC
+++ /branches/experimental/parser/tools/lexer_generator/code_generator.jinja Fri Nov 22 13:16:25 2013 UTC
@@ -5,43 +5,32 @@
{%- if not loop.first %} || {% endif -%}
{%- if r[0] == 'PRIMARY_RANGE' -%}
{%- if r[1][0] == r[1][1] -%}
- yych == {{r[1][0]}}
+ primary_char == {{r[1][0]}}
{%- elif r[1][0] == 0 -%}
- yych <= {{r[1][1]}}
+ primary_char <= {{r[1][1]}}
{%- elif r[1][1] == upper_bound and not encoding == 'utf16'-%}
- yych >= {{r[1][0]}}
+ primary_char >= {{r[1][0]}}
{%- else -%}
- ({{r[1][0]}} <= yych && yych <= {{r[1][1]}})
+ ({{r[1][0]}} <= primary_char && primary_char <= {{r[1][1]}})
{%- endif -%}
{%- elif r[0] == 'CLASS' -%}
{%- if r[1] == 'eos' -%}
- (yych == 0 && cursor_ >= buffer_end_)
+ (primary_char == 0 && cursor_ >= buffer_end_)
{%- elif r[1] == 'zero' -%}
- (yych == 0 && cursor_ < buffer_end_)
- {%- elif encoding == 'latin1' -%}
- false /* {{r[1]}} */
- {%- elif encoding == 'utf16' -%}
- {%- if r[1] == 'byte_order_mark' -%}
- (yych == 0xfffe || yych == 0xfeff)
- {%- elif r[1] == 'non_latin_1_whitespace' -%}
- (yych > {{upper_bound}} && unicode_cache_->IsWhiteSpace(yych))
- {%- elif r[1] == 'non_latin_1_letter' -%}
- (yych > {{upper_bound}} && unicode_cache_->IsLetter(yych))
- {%- elif r[1] == 'non_latin_1_identifier_part_not_letter' -%}
- (yych > {{upper_bound}} &&
- unicode_cache_->IsIdentifierPartNotLetter(yych))
- {%- elif r[1] == 'non_latin_1_line_terminator' -%}
- (yych > {{upper_bound}} && unicode_cache_->IsLineTerminator(yych))
- {%- elif r[1] == 'non_latin_1_everything_else' -%}
- {# FIXME: Optimize this away #}
- (yych > {{upper_bound}} &&
- !unicode_cache_->IsWhiteSpace(yych) &&
- !unicode_cache_->IsLetter(yych) &&
- !unicode_cache_->IsIdentifierPartNotLetter(yych) &&
- !unicode_cache_->IsLineTerminator(yych))
- {%- else %}
- uncompilable code for {{encoding}} {{r[0]}} {{r[1]}}
- {%- endif -%}
+ (primary_char == 0 && cursor_ < buffer_end_)
+ {%- else %}
+ uncompilable code for {{encoding}} {{r[0]}} {{r[1]}}
+ {%- endif -%}
+ {# These classes require long_char and to be outside the primary range #}
+ {%- elif r[0] == 'LONG_CHAR_CLASS' and encoding in ['utf16', 'utf8'] -%}
+ {%- if r[1] == 'byte_order_mark' -%}
+ (long_char == 0xfffe || long_char == 0xfeff)
+ {%- elif r[1] == 'call' -%}
+ unicode_cache_->{{r[2]}}(long_char)
+ {%- elif r[1] == 'invert' -%}
+ !({{do_key(r[2])}})
+ {%- elif r[1] == 'catch_all' -%}
+ (true || long_char == 0) /* {{r[1]}} */
{%- else -%}
uncompilable code for {{encoding}} {{r[0]}} {{r[1]}}
{%- endif -%}
@@ -58,7 +47,7 @@
{% elif type == 'terminate' %}
PUSH_EOS();
{% elif type == 'terminate_illegal' %}
- start_ = marker_; BACKWARD(); PUSH_TOKEN(Token::ILLEGAL);
+ start_ = marker_; BACKWARD(1); PUSH_TOKEN(Token::ILLEGAL);
{% elif type == 'skip' %}
SKIP();
{% elif type == 'skip_and_terminate' %}
@@ -112,7 +101,7 @@
{%- endif %}
{%- if debug_print %}
- fprintf(stderr, "char at hand is %c (%d)\n", yych, yych);
+ fprintf(stderr, "char at hand is %c (%d)\n", primary_char, primary_char);
{% endif -%}
{%- macro do_transition(transition_state_id) -%}
@@ -126,7 +115,7 @@
{%- endmacro -%}
{%- if state['switch_transitions'] -%}
- switch(yych) {
+ switch(primary_char) {
{%- for ranges, transition_state_id in state['switch_transitions'] %}
{%- for r in ranges -%}
{%- for key in range(r[0], r[1] + 1) -%}
@@ -139,7 +128,7 @@
{%- endif -%}
{%- for key, transition_state_id in state.transitions %}
- if ({{do_key(key)}}) {
+ if ({{do_key(key)}}) { // normal if transition
{{ do_transition(transition_state_id) }}
}
{% endfor -%}
@@ -149,6 +138,18 @@
{{ do_transition(transition_state_id) }}
}
{% endfor -%}
+
+ {%- if state['long_char_transitions'] -%}
+ {# TODO macro this up for utf8 #}
+ if (primary_char > {{upper_bound}}) {
+ uint32_t long_char = primary_char;
+ {%- for key, transition_state_id in state['long_char_transitions'] %}
+ if ({{do_key(key)}}) { // long_char transition
+ {{ do_transition(transition_state_id) }}
+ }
+ {% endfor -%}
+ }
+ {%- endif-%}
{%- set match_action = state.match_action -%}
@@ -189,14 +190,15 @@
just_seen_line_terminator_ = true; \
}
-#define FORWARD() { \
- if (++cursor_ >= buffer_end_) yych = 0; \
- else yych = *(cursor_); \
+#define FORWARD() { \
+ if (++cursor_ >= buffer_end_) primary_char = 0; \
+ else primary_char = *(cursor_); \
}
-#define BACKWARD() { \
- if (--cursor_ >= buffer_end_) yych = 0; \
- else yych = *(cursor_); \
+#define BACKWARD(n) { \
+ cursor_ -= n; \
+ if (cursor_ >= buffer_end_) primary_char = 0; \
+ else primary_char = *(cursor_); \
}
#define SKIP() { \
@@ -209,9 +211,9 @@
Token::Value ExperimentalScanner<{{char_type}}>::Next(int* beg_pos_to_return,
int* end_pos_to_return) {
// Setup environment.
- {{char_type}} yych;
- if (cursor_ >= buffer_end_) yych = 0;
- else yych = *(cursor_);
+ {{char_type}} primary_char;
+ if (cursor_ >= buffer_end_) primary_char = 0;
+ else primary_char = *(cursor_);
{# first node is start node #}
{% for dfa_state in dfa_states -%}
=======================================
--- /branches/experimental/parser/tools/lexer_generator/code_generator.py Fri Nov 22 09:25:13 2013 UTC
+++ /branches/experimental/parser/tools/lexer_generator/code_generator.py Fri Nov 22 13:16:25 2013 UTC
@@ -129,6 +129,7 @@
'transitions' : transitions,
'switch_transitions' : [],
'deferred_transitions' : [],
+ 'long_char_transitions' : [],
'disjoint_keys' : disjoint_keys,
'inline' : None,
'depth' : None,
@@ -198,6 +199,75 @@
state['deferred_transitions'] = deferred_transitions
return split_count + (0 if no_switch else 1)
+ __call_map = {
+ 'non_primary_whitespace' : 'IsWhiteSpace',
+ 'non_primary_letter' : 'IsLetter',
+ 'non_primary_identifier_part_not_letter' : 'IsIdentifierPartNotLetter',
+ 'non_primary_line_terminator' : 'IsLineTerminator',
+ }
+
+ def __rewrite_deferred_transitions(self, state):
+ assert not state['long_char_transitions']
+ transitions = state['deferred_transitions']
+ if not transitions:
+ return
+ encoding = self.__dfa.encoding()
+ bom = 'byte_order_mark'
+ catch_all = 'non_primary_everything_else'
+ all_classes = set(encoding.class_name_iter())
+ fast_classes = set(['eos', 'zero'])
+ call_classes = all_classes - fast_classes - set([bom, catch_all])
+ def remap_transition(class_name):
+ if class_name in call_classes:
+ return ('LONG_CHAR_CLASS', 'call', self.__call_map[class_name])
+ if class_name == bom:
+ return ('LONG_CHAR_CLASS', class_name)
+ raise Exception(class_name)
+ fast_transitions = []
+ long_class_transitions = []
+ long_class_map = {}
+ catchall_transition = None
+ # loop through and remove catch_all_transitions
+ for (classes, transition_node_id) in transitions:
+ ft = []
+ lct = []
+ has_catch_all = False
+ for (class_type, class_name) in classes:
+ if class_name in fast_classes:
+ ft.append((class_type, class_name))
+ else:
+ assert not class_name in long_class_map
+ long_class_map[class_name] = transition_node_id
+ if class_name == catch_all:
+ assert not has_catch_all
+ assert catchall_transition == None
+ has_catch_all = True
+ else:
+ lct.append(remap_transition(class_name))
+ if ft:
+ fast_transitions.append((ft, transition_node_id))
+ if has_catch_all:
+ catchall_transition = (lct, transition_node_id)
+ elif lct:
+ long_class_transitions.append((lct, transition_node_id))
+ # all transitions are fast
+ if not long_class_map:
+ return
+ if catchall_transition:
+ catchall_transitions = all_classes - fast_classes
+ for class_name in long_class_map.iterkeys():
+ catchall_transitions.remove(class_name)
+ assert not catchall_transitions, "class inversion not unimplemented"
+ # split deferred transitions
+ state['deferred_transitions'] = fast_transitions
+ if catchall_transition:
+ catchall_transition = [
+ ([('LONG_CHAR_CLASS', 'catch_all')], catchall_transition[1])]
+ else:
+ catchall_transition = []
+ state['long_char_transitions'] = (long_class_transitions +
+ catchall_transition) # must be last
+
def __canonicalize_traversal(self):
dfa_states = []
self.__dfa.visit_all_states(lambda state, acc: dfa_states.append(state))
@@ -229,6 +299,9 @@
switched = reduce(self.__split_transitions, dfa_states, 0)
if self.__log:
print "%s states use switch (instead of if)" % switched
+ # rewrite deferred transitions
+ for state in dfa_states:
+ self.__rewrite_deferred_transitions(state)
def process(self):
=======================================
--- /branches/experimental/parser/tools/lexer_generator/transition_keys.py Fri Nov 22 09:25:13 2013 UTC
+++ /branches/experimental/parser/tools/lexer_generator/transition_keys.py Fri Nov 22 13:16:25 2013 UTC
@@ -57,7 +57,6 @@
return self.__name
def add_predefined_range(self, name, ranges):
- # TODO verify disjointness
self.__predefined_ranges[name] = ranges
def lower_bound(self):
@@ -76,6 +75,9 @@
def class_range_iter(self):
return self.__class_ranges.iteritems()
+ def class_name_iter(self):
+ return self.__class_ranges.iterkeys()
+
def class_value_iter(self):
return self.__class_ranges.itervalues()
@@ -463,7 +465,7 @@
super(Latin1Encoding, self).__init__(
'latin1',
(1, 255),
- ['eos', 'zero', 'byte_order_mark'])
+ ['eos', 'zero'])
self.add_predefined_range(
'whitespace', [(9, 9), (11, 12), (32, 32), (133, 133), (160, 160)])
self.add_predefined_range(
@@ -481,27 +483,28 @@
'utf16',
(1, 255),
['eos', 'zero', 'byte_order_mark',
- 'non_latin_1_whitespace',
- 'non_latin_1_letter',
- 'non_latin_1_identifier_part_not_letter',
- 'non_latin_1_line_terminator',
- 'non_latin_1_everything_else'])
+ 'non_primary_whitespace',
+ 'non_primary_letter',
+ 'non_primary_identifier_part_not_letter',
+ 'non_primary_line_terminator',
+ 'non_primary_everything_else'])
self.add_predefined_range(
'whitespace',
[(9, 9), (11, 12), (32, 32), (133, 133), (160, 160),
- self.class_range('non_latin_1_whitespace')])
+ self.class_range('byte_order_mark'),
+ self.class_range('non_primary_whitespace')])
self.add_predefined_range(
'letter', [
(65, 90), (97, 122), (170, 170), (181, 181),
(186, 186), (192, 214), (216, 246), (248, 255),
- self.class_range('non_latin_1_letter')])
+ self.class_range('non_primary_letter')])
self.add_predefined_range(
'line_terminator',
- [(10, 10), (13, 13), self.class_range('non_latin_1_line_terminator')])
+ [(10, 10), (13, 13), self.class_range('non_primary_line_terminator')])
self.add_predefined_range(
'identifier_part_not_letter',
[(48, 57), (95, 95),
- self.class_range('non_latin_1_identifier_part_not_letter')])
+ self.class_range('non_primary_identifier_part_not_letter')])
class Utf8Encoding(KeyEncoding):
@@ -510,20 +513,22 @@
'utf8',
(1, 127),
['eos', 'zero', 'byte_order_mark',
- 'non_ascii_whitespace',
- 'non_ascii_letter',
- 'non_ascii_identifier_part_not_letter',
- 'non_ascii_line_terminator',
- 'non_ascii_everything_else'])
+ 'non_primary_whitespace',
+ 'non_primary_letter',
+ 'non_primary_identifier_part_not_letter',
+ 'non_primary_line_terminator',
+ 'non_primary_everything_else'])
self.add_predefined_range(
'whitespace',
- [(9, 9), (11, 12), (32, 32), self.class_range('non_ascii_whitespace')])
+ [(9, 9), (11, 12), (32, 32),
+ self.class_range('byte_order_mark'),
+ self.class_range('non_primary_whitespace')])
self.add_predefined_range(
- 'letter', [(65, 90), (97, 122), self.class_range('non_ascii_letter')])
+ 'letter', [(65, 90), (97, 122), self.class_range('non_primary_letter')])
self.add_predefined_range(
'line_terminator',
- [(10, 10), (13, 13), self.class_range('non_ascii_line_terminator')])
+ [(10, 10), (13, 13), self.class_range('non_primary_line_terminator')])
self.add_predefined_range(
'identifier_part_not_letter',
[(48, 57), (95, 95),
- self.class_range('non_ascii_identifier_part_not_letter')])
+ self.class_range('non_primary_identifier_part_not_letter')])
--
--
v8-dev mailing list
[email protected]
http://groups.google.com/group/v8-dev
---
You received this message because you are subscribed to the Google Groups "v8-dev" group.
To unsubscribe from this group and stop receiving emails from it, send an email
to [email protected].
For more options, visit https://groups.google.com/groups/opt_out.