Revision: 17452
Author: [email protected]
Date: Thu Oct 31 14:36:42 2013 UTC
Log: Experimental parser: key inversion
[email protected]
BUG=
Review URL: https://codereview.chromium.org/54773002
http://code.google.com/p/v8/source/detail?r=17452
Modified:
/branches/experimental/parser/tools/lexer_generator/automata_test.py
/branches/experimental/parser/tools/lexer_generator/regex_lexer.py
/branches/experimental/parser/tools/lexer_generator/regex_parser.py
/branches/experimental/parser/tools/lexer_generator/transition_key_test.py
/branches/experimental/parser/tools/lexer_generator/transition_keys.py
=======================================
--- /branches/experimental/parser/tools/lexer_generator/automata_test.py
Thu Oct 31 12:54:57 2013 UTC
+++ /branches/experimental/parser/tools/lexer_generator/automata_test.py
Thu Oct 31 14:36:42 2013 UTC
@@ -31,9 +31,7 @@
from dfa import Dfa
def build_automata(string):
- parser = RegexParser()
- parser.build()
- graph = parser.parse(string)
+ graph = RegexParser.parse(string)
nfa = NfaBuilder().nfa(graph)
(start_name, dfa_nodes, end_nodes) = nfa.compute_dfa()
dfa = Dfa(start_name, dfa_nodes, end_nodes)
=======================================
--- /branches/experimental/parser/tools/lexer_generator/regex_lexer.py Wed
Oct 30 15:12:52 2013 UTC
+++ /branches/experimental/parser/tools/lexer_generator/regex_lexer.py Thu
Oct 31 14:36:42 2013 UTC
@@ -91,7 +91,7 @@
t.value = t.value[1:]
return t
- t_class_CLASS_LITERAL = r'[a-zA-Z]' # fix this
+ t_class_CLASS_LITERAL = r'[a-zA-Z0-9]' # fix this
t_ANY_ignore = '\n'
=======================================
--- /branches/experimental/parser/tools/lexer_generator/regex_parser.py Thu
Oct 31 08:19:46 2013 UTC
+++ /branches/experimental/parser/tools/lexer_generator/regex_parser.py Thu
Oct 31 14:36:42 2013 UTC
@@ -129,6 +129,8 @@
self.lexer = RegexLexer()
self.lexer.build(**kwargs)
- def parse(self, data):
- return self.parser.parse(data, lexer=self.lexer.lexer)
-
+ @staticmethod
+ def parse(data):
+ parser = RegexParser()
+ parser.build()
+ return parser.parser.parse(data, lexer=parser.lexer.lexer)
=======================================
---
/branches/experimental/parser/tools/lexer_generator/transition_key_test.py
Thu Oct 31 11:01:22 2013 UTC
+++
/branches/experimental/parser/tools/lexer_generator/transition_key_test.py
Thu Oct 31 14:36:42 2013 UTC
@@ -27,6 +27,7 @@
import unittest
from transition_keys import TransitionKey
+from regex_parser import RegexParser
class TransitionKeyTestCase(unittest.TestCase):
@@ -44,5 +45,28 @@
for (left, right) in self.__equal_pairs:
self.assertEqual(hash(left), hash(right))
+ def test_classes(self):
+ # class regex, should match, should not match
+ data = [
+ ("1-2", "12", "ab"),
+ ("a-zA-Z", "abyzABYZ" , "123"),
+ ("a-zA-Z0g" , "abyzABYZ0" , "123"),
+ ]
+ for (string, match, no_match) in data:
+ for invert in [False, True]:
+ if invert:
+ regex = "[^%s]" % string
+ token = "NOT_CLASS"
+ else:
+ regex = "[%s]" % string
+ token = "CLASS"
+ graph = RegexParser.parse(regex)
+ assert graph[0] == token
+ key = TransitionKey.character_class(invert, graph[1])
+ for c in match:
+ self.assertEqual(invert, not key.matches_char(c))
+ for c in no_match:
+ self.assertEqual(invert, key.matches_char(c))
+
if __name__ == '__main__':
unittest.main()
=======================================
--- /branches/experimental/parser/tools/lexer_generator/transition_keys.py
Thu Oct 31 12:54:57 2013 UTC
+++ /branches/experimental/parser/tools/lexer_generator/transition_keys.py
Thu Oct 31 14:36:42 2013 UTC
@@ -24,6 +24,7 @@
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
from string import printable
class TransitionKey:
@@ -35,19 +36,26 @@
__upper_bound = 257
@staticmethod
- def __verify_ranges(ranges):
- last = (TransitionKey.__lower_bound - 1, TransitionKey.__lower_bound -
1)
+ def __verify_ranges(ranges, check_merged):
+ assert ranges
+ last = None
for r in ranges:
assert TransitionKey.__lower_bound <= r[0]
assert r[1] <= TransitionKey.__upper_bound
assert r[0] <= r[1]
- assert last[1] < r[0]
- # TODO check classes are always disjoint points
+ r_is_class = (
+ r == TransitionKey.__unicode_whitespace_bounds or
+ r == TransitionKey.__unicode_literal_bounds)
+ if last != None and check_merged:
+ assert last[1] + 1 < r[0] or r_is_class
+ if (r[0] > TransitionKey.__latin_1_upper_bound or
+ r[1] > TransitionKey.__latin_1_upper_bound):
+ assert r_is_class
last = r
@staticmethod
def __create(ranges):
- TransitionKey.__verify_ranges(ranges)
+ TransitionKey.__verify_ranges(ranges, True)
key = TransitionKey()
key.__ranges = tuple(ranges) # immutable
key.__cached_hash = None
@@ -57,7 +65,10 @@
@staticmethod
def epsilon():
if (TransitionKey.__cached_epsilon == None):
- TransitionKey.__cached_epsilon = TransitionKey.__create([])
+ key = TransitionKey()
+ key.__ranges = tuple([])
+ key.__cached_hash = None
+ TransitionKey.__cached_epsilon = key
return TransitionKey.__cached_epsilon
__cached_any = None
@@ -79,10 +90,24 @@
char <= TransitionKey.__latin_1_upper_bound)
return TransitionKey.__create([(char, char)])
+ @staticmethod
+ def __process_graph(graph, ranges):
+ key = graph[0]
+ if key == 'RANGE':
+ ranges.append((ord(graph[1]), ord(graph[2])))
+ elif key == 'LITERAL':
+ ranges.append((ord(graph[1]), ord(graph[1])))
+ elif key == 'CAT':
+ for x in [graph[1], graph[2]]:
+ TransitionKey.__process_graph(x, ranges)
+ else:
+ assert False, "bad key %s" % key
+
@staticmethod
def character_class(invert, graph):
- # TODO
- return TransitionKey.__create([(129, 129)])
+ ranges = []
+ TransitionKey.__process_graph(graph, ranges)
+ return TransitionKey.__key_from_ranges(invert, ranges)
def matches_char(self, char):
char = ord(char)
@@ -139,13 +164,7 @@
return ", ".join(TransitionKey.__print_range(x) for x in self.__ranges)
@staticmethod
- def disjoint_keys(key_set):
- range_map = {}
- for x in key_set:
- for r in x.__ranges:
- if not r[0] in range_map:
- range_map[r[0]] = []
- range_map[r[0]].append(r[1])
+ def __disjoint_keys(range_map):
sort = lambda x : sorted(set(x))
range_map = sorted(map(lambda (k, v): (k, sort(v)), range_map.items()))
ranges = []
@@ -165,5 +184,76 @@
if to_push:
current = range_map[i + 1]
range_map[i + 1] = (current[0], sort(current[1] + to_push))
- TransitionKey.__verify_ranges(ranges)
+ return ranges
+
+ @staticmethod
+ def disjoint_keys(key_set):
+ if not key_set:
+ return []
+ range_map = {}
+ for x in key_set:
+ for r in x.__ranges:
+ if not r[0] in range_map:
+ range_map[r[0]] = []
+ range_map[r[0]].append(r[1])
+ ranges = TransitionKey.__disjoint_keys(range_map)
+ TransitionKey.__verify_ranges(ranges, False)
return map(lambda x : TransitionKey.__create([x]), ranges)
+
+ @staticmethod
+ def __key_from_ranges(invert, ranges):
+ range_map = {}
+ for r in ranges:
+ if not r[0] in range_map:
+ range_map[r[0]] = []
+ range_map[r[0]].append(r[1])
+ ranges = TransitionKey.__disjoint_keys(range_map)
+ ranges = TransitionKey.__merge_ranges(ranges)
+ if invert:
+ ranges = TransitionKey.__invert_ranges(ranges)
+ return TransitionKey.__create(ranges)
+
+ @staticmethod
+ def __merge_ranges(ranges):
+ merged = []
+ last = None
+ for r in ranges:
+ if last == None:
+ last = r
+ elif (last[1] + 1 == r[0] and
+ r != TransitionKey.__unicode_whitespace_bounds and
+ r != TransitionKey.__unicode_literal_bounds):
+ last = (last[0], r[1])
+ else:
+ merged.append(last)
+ last = r
+ if last != None:
+ merged.append(last)
+ return merged
+
+ @staticmethod
+ def __invert_ranges(ranges):
+ inverted = []
+ last = None
+ contains_whitespace = False
+ contains_literal = False
+ for r in ranges:
+ if r == TransitionKey.__unicode_whitespace_bounds:
+ contains_whitespace = True
+ continue
+ if r == TransitionKey.__unicode_literal_bounds:
+ contains_literal = True
+ continue
+ if last == None:
+ if r[0] != TransitionKey.__lower_bound:
+ inverted.append((TransitionKey.__lower_bound, r[0] - 1))
+ elif last[1] + 1 < r[0]:
+ inverted.append((last[1] + 1, r[0] - 1))
+ last = r
+ if last != None and last[1] < TransitionKey.__latin_1_upper_bound:
+ inverted.append((last[1] + 1, TransitionKey.__latin_1_upper_bound))
+ if not contains_whitespace:
+ inverted.append(TransitionKey.__unicode_whitespace_bounds)
+ if not contains_literal:
+ inverted.append(TransitionKey.__unicode_literal_bounds)
+ return inverted
--
--
v8-dev mailing list
[email protected]
http://groups.google.com/group/v8-dev
---
You received this message because you are subscribed to the Google Groups "v8-dev" group.
To unsubscribe from this group and stop receiving emails from it, send an email
to [email protected].
For more options, visit https://groups.google.com/groups/opt_out.