Reviewers: marja,

Message:
Committed patchset #1 manually as r17992 (presubmit successful).

Description:
Experimental parser: add utf8 encoding

[email protected]

BUG=

Committed: https://code.google.com/p/v8/source/detail?r=17992

Please review this at https://codereview.chromium.org/82803003/

SVN Base: https://v8.googlecode.com/svn/branches/experimental/parser

Affected files (+33, -6 lines):
  M tools/lexer_generator/automaton.py
  M tools/lexer_generator/code_generator.py
  M tools/lexer_generator/dfa.py
  M tools/lexer_generator/nfa.py
  M tools/lexer_generator/transition_keys.py


Index: tools/lexer_generator/automaton.py
diff --git a/tools/lexer_generator/automaton.py b/tools/lexer_generator/automaton.py
index 7321fae5972459374fb9bd33d04a8031fb8f9a40..ddd59fed54e5971cdcc0d1c720d619ba567321f4 100644
--- a/tools/lexer_generator/automaton.py
+++ b/tools/lexer_generator/automaton.py
@@ -217,6 +217,8 @@ class Automaton(object):
       for key, state in node.key_state_iter():
         if key == TransitionKey.epsilon():
           key = "ε"
+        else:
+          key = key.to_string(self.encoding())
         edge_content.append("  S_%s -> S_%s [ label = \"%s\" ];" % (
             node.node_number(), state.node_number(), escape(key)))
       return (node_content, edge_content)
Index: tools/lexer_generator/code_generator.py
diff --git a/tools/lexer_generator/code_generator.py b/tools/lexer_generator/code_generator.py
index a703890e076c193046d23f98ab84333bc8f23835..e6d44662908f0e8bc32decc2d3a006cddebaae0f 100644
--- a/tools/lexer_generator/code_generator.py
+++ b/tools/lexer_generator/code_generator.py
@@ -247,12 +247,9 @@ class CodeGenerator:
     template = template_env.get_template('code_generator.jinja')

     encoding = self.__dfa.encoding().name()
-    if encoding == 'latin1':
-      char_type = 'uint8_t'
-    elif encoding == 'utf16':
-      char_type = 'uint16_t'
-    else:
-      raise Exception('Unsupported encoding %s' % encoding)
+    char_types = {'latin1': 'uint8_t', 'utf16': 'uint16_t', 'utf8': 'int8_t'}
+    char_type = char_types[encoding]
+
     return template.render(
       start_node_number = 0,
       debug_print = self.__debug_print,
Index: tools/lexer_generator/dfa.py
diff --git a/tools/lexer_generator/dfa.py b/tools/lexer_generator/dfa.py
index b004e24d59ac9fbb766d12c1b8886126d6e4468b..f135b8e1e9d133298be386664c186365bb89a585 100644
--- a/tools/lexer_generator/dfa.py
+++ b/tools/lexer_generator/dfa.py
@@ -47,6 +47,7 @@ class DfaState(AutomatonState):
     return self.__action

   def add_transition(self, key, state):
+    assert key != None
     assert not key == TransitionKey.epsilon()
     assert not self.__transitions.has_key(key)
     self.__transitions[key] = state
Index: tools/lexer_generator/nfa.py
diff --git a/tools/lexer_generator/nfa.py b/tools/lexer_generator/nfa.py
index 4afb99f18b1eb8dccbe86a979c227239b224ce2a..43978c33be567605e029298d456b3ca3b452fccf 100644
--- a/tools/lexer_generator/nfa.py
+++ b/tools/lexer_generator/nfa.py
@@ -62,6 +62,7 @@ class NfaState(AutomatonState):
     return self.__transitions

   def __add_transition(self, key, next_state):
+    assert key != None
     if next_state == None:
       assert not self.is_closed(), "already closed"
       self.__unclosed.add(key)
Index: tools/lexer_generator/transition_keys.py
diff --git a/tools/lexer_generator/transition_keys.py b/tools/lexer_generator/transition_keys.py
index 5401c2fe9b212a419f657f5b239e829ab003cb90..765970078549a7ebe89b0f2b08ed19ee60b9d2f7 100644
--- a/tools/lexer_generator/transition_keys.py
+++ b/tools/lexer_generator/transition_keys.py
@@ -37,6 +37,7 @@ class KeyEncoding(object):
     if not KeyEncoding.__encodings:
       Latin1Encoding()
       Utf16Encoding()
+      Utf8Encoding()
     return KeyEncoding.__encodings[name]

   def __init__(self, name, primary_range, class_names):
@@ -501,3 +502,28 @@ class Utf16Encoding(KeyEncoding):
       'identifier_part_not_letter',
       [(48, 57), (95, 95),
        self.class_range('non_latin_1_identifier_part_not_letter')])
+
+class Utf8Encoding(KeyEncoding):
+
+  def __init__(self):
+    super(Utf8Encoding, self).__init__(
+      'utf8',
+      (1, 127),
+      ['eos', 'zero', 'byte_order_mark',
+       'non_ascii_whitespace',
+       'non_ascii_letter',
+       'non_ascii_identifier_part_not_letter',
+       'non_ascii_line_terminator',
+       'non_ascii_everything_else'])
+    self.add_predefined_range(
+      'whitespace',
+      [(9, 9), (11, 12), (32, 32), self.class_range('non_ascii_whitespace')])
+    self.add_predefined_range(
+      'letter', [(65, 90), (97, 122), self.class_range('non_ascii_letter')])
+    self.add_predefined_range(
+      'line_terminator',
+      [(10, 10), (13, 13), self.class_range('non_ascii_line_terminator')])
+    self.add_predefined_range(
+      'identifier_part_not_letter',
+      [(48, 57), (95, 95),
+       self.class_range('non_ascii_identifier_part_not_letter')])


--
--
v8-dev mailing list
[email protected]
http://groups.google.com/group/v8-dev
---
You received this message because you are subscribed to the Google Groups "v8-dev" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to [email protected].
For more options, visit https://groups.google.com/groups/opt_out.

Reply via email to