Revision: 17452
Author:   [email protected]
Date:     Thu Oct 31 14:36:42 2013 UTC
Log:      Experimental parser: key inversion

[email protected]
BUG=

Review URL: https://codereview.chromium.org/54773002
http://code.google.com/p/v8/source/detail?r=17452

Modified:
 /branches/experimental/parser/tools/lexer_generator/automata_test.py
 /branches/experimental/parser/tools/lexer_generator/regex_lexer.py
 /branches/experimental/parser/tools/lexer_generator/regex_parser.py
 /branches/experimental/parser/tools/lexer_generator/transition_key_test.py
 /branches/experimental/parser/tools/lexer_generator/transition_keys.py

=======================================
--- /branches/experimental/parser/tools/lexer_generator/automata_test.py Thu Oct 31 12:54:57 2013 UTC +++ /branches/experimental/parser/tools/lexer_generator/automata_test.py Thu Oct 31 14:36:42 2013 UTC
@@ -31,9 +31,7 @@
 from dfa import Dfa

 def build_automata(string):
-  parser = RegexParser()
-  parser.build()
-  graph = parser.parse(string)
+  graph = RegexParser.parse(string)
   nfa = NfaBuilder().nfa(graph)
   (start_name, dfa_nodes, end_nodes) = nfa.compute_dfa()
   dfa = Dfa(start_name, dfa_nodes, end_nodes)
=======================================
--- /branches/experimental/parser/tools/lexer_generator/regex_lexer.py Wed Oct 30 15:12:52 2013 UTC +++ /branches/experimental/parser/tools/lexer_generator/regex_lexer.py Thu Oct 31 14:36:42 2013 UTC
@@ -91,7 +91,7 @@
     t.value = t.value[1:]
     return t

-  t_class_CLASS_LITERAL = r'[a-zA-Z]' # fix this
+  t_class_CLASS_LITERAL = r'[a-zA-Z0-9]' # fix this

   t_ANY_ignore  = '\n'

=======================================
--- /branches/experimental/parser/tools/lexer_generator/regex_parser.py Thu Oct 31 08:19:46 2013 UTC +++ /branches/experimental/parser/tools/lexer_generator/regex_parser.py Thu Oct 31 14:36:42 2013 UTC
@@ -129,6 +129,8 @@
     self.lexer = RegexLexer()
     self.lexer.build(**kwargs)

-  def parse(self, data):
-    return self.parser.parse(data, lexer=self.lexer.lexer)
-
+  @staticmethod
+  def parse(data):
+    parser = RegexParser()
+    parser.build()
+    return parser.parser.parse(data, lexer=parser.lexer.lexer)
=======================================
--- /branches/experimental/parser/tools/lexer_generator/transition_key_test.py Thu Oct 31 11:01:22 2013 UTC +++ /branches/experimental/parser/tools/lexer_generator/transition_key_test.py Thu Oct 31 14:36:42 2013 UTC
@@ -27,6 +27,7 @@

 import unittest
 from transition_keys import TransitionKey
+from regex_parser import RegexParser

 class TransitionKeyTestCase(unittest.TestCase):

@@ -44,5 +45,28 @@
     for (left, right) in self.__equal_pairs:
       self.assertEqual(hash(left), hash(right))

+  def test_classes(self):
+    # class regex, should match, should not match
+    data = [
+      ("1-2", "12", "ab"),
+      ("a-zA-Z", "abyzABYZ" , "123"),
+      ("a-zA-Z0g" , "abyzABYZ0" , "123"),
+    ]
+    for (string, match, no_match) in data:
+      for invert in [False, True]:
+        if invert:
+          regex = "[^%s]" % string
+          token = "NOT_CLASS"
+        else:
+          regex = "[%s]" % string
+          token = "CLASS"
+        graph = RegexParser.parse(regex)
+        assert graph[0] == token
+        key = TransitionKey.character_class(invert, graph[1])
+        for c in match:
+          self.assertEqual(invert, not key.matches_char(c))
+        for c in no_match:
+          self.assertEqual(invert, key.matches_char(c))
+
 if __name__ == '__main__':
     unittest.main()
=======================================
--- /branches/experimental/parser/tools/lexer_generator/transition_keys.py Thu Oct 31 12:54:57 2013 UTC +++ /branches/experimental/parser/tools/lexer_generator/transition_keys.py Thu Oct 31 14:36:42 2013 UTC
@@ -24,6 +24,7 @@
 # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
 from string import printable

 class TransitionKey:
@@ -35,19 +36,26 @@
   __upper_bound = 257

   @staticmethod
-  def __verify_ranges(ranges):
- last = (TransitionKey.__lower_bound - 1, TransitionKey.__lower_bound - 1)
+  def __verify_ranges(ranges, check_merged):
+    assert ranges
+    last = None
     for r in ranges:
       assert TransitionKey.__lower_bound <= r[0]
       assert r[1] <= TransitionKey.__upper_bound
       assert r[0] <= r[1]
-      assert last[1] < r[0]
-      # TODO check classes are always disjoint points
+      r_is_class = (
+        r == TransitionKey.__unicode_whitespace_bounds or
+        r == TransitionKey.__unicode_literal_bounds)
+      if last != None and check_merged:
+        assert last[1] + 1 < r[0] or r_is_class
+      if (r[0] > TransitionKey.__latin_1_upper_bound or
+          r[1] > TransitionKey.__latin_1_upper_bound):
+        assert r_is_class
       last = r

   @staticmethod
   def __create(ranges):
-    TransitionKey.__verify_ranges(ranges)
+    TransitionKey.__verify_ranges(ranges, True)
     key = TransitionKey()
     key.__ranges = tuple(ranges) # immutable
     key.__cached_hash = None
@@ -57,7 +65,10 @@
   @staticmethod
   def epsilon():
     if (TransitionKey.__cached_epsilon == None):
-      TransitionKey.__cached_epsilon = TransitionKey.__create([])
+      key = TransitionKey()
+      key.__ranges = tuple([])
+      key.__cached_hash = None
+      TransitionKey.__cached_epsilon = key
     return TransitionKey.__cached_epsilon

   __cached_any = None
@@ -79,10 +90,24 @@
             char <= TransitionKey.__latin_1_upper_bound)
     return TransitionKey.__create([(char, char)])

+  @staticmethod
+  def __process_graph(graph, ranges):
+    key = graph[0]
+    if key == 'RANGE':
+      ranges.append((ord(graph[1]), ord(graph[2])))
+    elif key == 'LITERAL':
+      ranges.append((ord(graph[1]), ord(graph[1])))
+    elif key == 'CAT':
+      for x in [graph[1], graph[2]]:
+        TransitionKey.__process_graph(x, ranges)
+    else:
+      assert False, "bad key %s" % key
+
   @staticmethod
   def character_class(invert, graph):
-    # TODO
-    return TransitionKey.__create([(129, 129)])
+    ranges = []
+    TransitionKey.__process_graph(graph, ranges)
+    return TransitionKey.__key_from_ranges(invert, ranges)

   def matches_char(self, char):
     char = ord(char)
@@ -139,13 +164,7 @@
     return ", ".join(TransitionKey.__print_range(x) for x in self.__ranges)

   @staticmethod
-  def disjoint_keys(key_set):
-    range_map = {}
-    for x in key_set:
-      for r in x.__ranges:
-        if not r[0] in range_map:
-          range_map[r[0]] = []
-        range_map[r[0]].append(r[1])
+  def __disjoint_keys(range_map):
     sort = lambda x : sorted(set(x))
     range_map = sorted(map(lambda (k, v): (k, sort(v)), range_map.items()))
     ranges = []
@@ -165,5 +184,76 @@
       if to_push:
         current = range_map[i + 1]
         range_map[i + 1] = (current[0], sort(current[1] + to_push))
-    TransitionKey.__verify_ranges(ranges)
+    return ranges
+
+  @staticmethod
+  def disjoint_keys(key_set):
+    if not key_set:
+      return []
+    range_map = {}
+    for x in key_set:
+      for r in x.__ranges:
+        if not r[0] in range_map:
+          range_map[r[0]] = []
+        range_map[r[0]].append(r[1])
+    ranges = TransitionKey.__disjoint_keys(range_map)
+    TransitionKey.__verify_ranges(ranges, False)
     return map(lambda x : TransitionKey.__create([x]), ranges)
+
+  @staticmethod
+  def __key_from_ranges(invert, ranges):
+    range_map = {}
+    for r in ranges:
+      if not r[0] in range_map:
+        range_map[r[0]] = []
+      range_map[r[0]].append(r[1])
+    ranges = TransitionKey.__disjoint_keys(range_map)
+    ranges = TransitionKey.__merge_ranges(ranges)
+    if invert:
+      ranges = TransitionKey.__invert_ranges(ranges)
+    return TransitionKey.__create(ranges)
+
+  @staticmethod
+  def __merge_ranges(ranges):
+    merged = []
+    last = None
+    for r in ranges:
+      if last == None:
+        last = r
+      elif (last[1] + 1 == r[0] and
+            r != TransitionKey.__unicode_whitespace_bounds and
+            r != TransitionKey.__unicode_literal_bounds):
+        last = (last[0], r[1])
+      else:
+        merged.append(last)
+        last = r
+    if last != None:
+      merged.append(last)
+    return merged
+
+  @staticmethod
+  def __invert_ranges(ranges):
+    inverted = []
+    last = None
+    contains_whitespace = False
+    contains_literal = False
+    for r in ranges:
+      if r == TransitionKey.__unicode_whitespace_bounds:
+        contains_whitespace = True
+        continue
+      if r == TransitionKey.__unicode_literal_bounds:
+        contains_literal = True
+        continue
+      if last == None:
+        if r[0] != TransitionKey.__lower_bound:
+          inverted.append((TransitionKey.__lower_bound, r[0] - 1))
+      elif last[1] + 1 < r[0]:
+        inverted.append((last[1] + 1, r[0] - 1))
+      last = r
+    if last != None and last[1] < TransitionKey.__latin_1_upper_bound:
+      inverted.append((last[1] + 1, TransitionKey.__latin_1_upper_bound))
+    if not contains_whitespace:
+      inverted.append(TransitionKey.__unicode_whitespace_bounds)
+    if not contains_literal:
+      inverted.append(TransitionKey.__unicode_literal_bounds)
+    return inverted

--
--
v8-dev mailing list
[email protected]
http://groups.google.com/group/v8-dev
--- You received this message because you are subscribed to the Google Groups "v8-dev" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to [email protected].
For more options, visit https://groups.google.com/groups/opt_out.

Reply via email to