Reviewers: marja,
Message:
Committed patchset #1 manually as r17992 (presubmit successful).
Description:
Experimental parser: add utf8 encoding
[email protected]
BUG=
Committed: https://code.google.com/p/v8/source/detail?r=17992
Please review this at https://codereview.chromium.org/82803003/
SVN Base: https://v8.googlecode.com/svn/branches/experimental/parser
Affected files (+33, -6 lines):
M tools/lexer_generator/automaton.py
M tools/lexer_generator/code_generator.py
M tools/lexer_generator/dfa.py
M tools/lexer_generator/nfa.py
M tools/lexer_generator/transition_keys.py
Index: tools/lexer_generator/automaton.py
diff --git a/tools/lexer_generator/automaton.py b/tools/lexer_generator/automaton.py
index 7321fae5972459374fb9bd33d04a8031fb8f9a40..ddd59fed54e5971cdcc0d1c720d619ba567321f4 100644
--- a/tools/lexer_generator/automaton.py
+++ b/tools/lexer_generator/automaton.py
@@ -217,6 +217,8 @@ class Automaton(object):
for key, state in node.key_state_iter():
if key == TransitionKey.epsilon():
key = "ε"
+ else:
+ key = key.to_string(self.encoding())
edge_content.append(" S_%s -> S_%s [ label = \"%s\" ];" % (
node.node_number(), state.node_number(), escape(key)))
return (node_content, edge_content)
Index: tools/lexer_generator/code_generator.py
diff --git a/tools/lexer_generator/code_generator.py b/tools/lexer_generator/code_generator.py
index a703890e076c193046d23f98ab84333bc8f23835..e6d44662908f0e8bc32decc2d3a006cddebaae0f 100644
--- a/tools/lexer_generator/code_generator.py
+++ b/tools/lexer_generator/code_generator.py
@@ -247,12 +247,9 @@ class CodeGenerator:
template = template_env.get_template('code_generator.jinja')
encoding = self.__dfa.encoding().name()
- if encoding == 'latin1':
- char_type = 'uint8_t'
- elif encoding == 'utf16':
- char_type = 'uint16_t'
- else:
- raise Exception('Unsupported encoding %s' % encoding)
+ char_types = {'latin1': 'uint8_t', 'utf16': 'uint16_t', 'utf8': 'int8_t'}
+ char_type = char_types[encoding]
+
return template.render(
start_node_number = 0,
debug_print = self.__debug_print,
Index: tools/lexer_generator/dfa.py
diff --git a/tools/lexer_generator/dfa.py b/tools/lexer_generator/dfa.py
index b004e24d59ac9fbb766d12c1b8886126d6e4468b..f135b8e1e9d133298be386664c186365bb89a585 100644
--- a/tools/lexer_generator/dfa.py
+++ b/tools/lexer_generator/dfa.py
@@ -47,6 +47,7 @@ class DfaState(AutomatonState):
return self.__action
def add_transition(self, key, state):
+ assert key != None
assert not key == TransitionKey.epsilon()
assert not self.__transitions.has_key(key)
self.__transitions[key] = state
Index: tools/lexer_generator/nfa.py
diff --git a/tools/lexer_generator/nfa.py b/tools/lexer_generator/nfa.py
index 4afb99f18b1eb8dccbe86a979c227239b224ce2a..43978c33be567605e029298d456b3ca3b452fccf 100644
--- a/tools/lexer_generator/nfa.py
+++ b/tools/lexer_generator/nfa.py
@@ -62,6 +62,7 @@ class NfaState(AutomatonState):
return self.__transitions
def __add_transition(self, key, next_state):
+ assert key != None
if next_state == None:
assert not self.is_closed(), "already closed"
self.__unclosed.add(key)
Index: tools/lexer_generator/transition_keys.py
diff --git a/tools/lexer_generator/transition_keys.py b/tools/lexer_generator/transition_keys.py
index 5401c2fe9b212a419f657f5b239e829ab003cb90..765970078549a7ebe89b0f2b08ed19ee60b9d2f7 100644
--- a/tools/lexer_generator/transition_keys.py
+++ b/tools/lexer_generator/transition_keys.py
@@ -37,6 +37,7 @@ class KeyEncoding(object):
if not KeyEncoding.__encodings:
Latin1Encoding()
Utf16Encoding()
+ Utf8Encoding()
return KeyEncoding.__encodings[name]
def __init__(self, name, primary_range, class_names):
@@ -501,3 +502,28 @@ class Utf16Encoding(KeyEncoding):
'identifier_part_not_letter',
[(48, 57), (95, 95),
self.class_range('non_latin_1_identifier_part_not_letter')])
+
+class Utf8Encoding(KeyEncoding):
+
+ def __init__(self):
+ super(Utf8Encoding, self).__init__(
+ 'utf8',
+ (1, 127),
+ ['eos', 'zero', 'byte_order_mark',
+ 'non_ascii_whitespace',
+ 'non_ascii_letter',
+ 'non_ascii_identifier_part_not_letter',
+ 'non_ascii_line_terminator',
+ 'non_ascii_everything_else'])
+ self.add_predefined_range(
+ 'whitespace',
+ [(9, 9), (11, 12), (32, 32), self.class_range('non_ascii_whitespace')])
+ self.add_predefined_range(
+ 'letter', [(65, 90), (97, 122), self.class_range('non_ascii_letter')])
+ self.add_predefined_range(
+ 'line_terminator',
+ [(10, 10), (13, 13), self.class_range('non_ascii_line_terminator')])
+ self.add_predefined_range(
+ 'identifier_part_not_letter',
+ [(48, 57), (95, 95),
+ self.class_range('non_ascii_identifier_part_not_letter')])
--
--
v8-dev mailing list
[email protected]
http://groups.google.com/group/v8-dev
---
You received this message because you are subscribed to the Google Groups "v8-dev" group.
To unsubscribe from this group and stop receiving emails from it, send an email
to [email protected].
For more options, visit https://groups.google.com/groups/opt_out.