Reviewers: dcarney,
Message:
Committed patchset #2 manually as r17936.
Description:
Experimental lexer generator: generate code for utf-16 character classes.
[email protected]
[email protected]
BUG=
Committed: https://code.google.com/p/v8/source/detail?r=17936
Please review this at https://codereview.chromium.org/78713002/
SVN Base: https://v8.googlecode.com/svn/branches/experimental/parser
Affected files (+48, -28 lines):
M src/char-predicates.h
M src/lexer/lexer.gyp
M src/lexer/lexer_py.re
M src/scanner.h
M tools/lexer_generator/code_generator.jinja
M tools/lexer_generator/code_generator.py
M tools/lexer_generator/generator.py
M tools/lexer_generator/transition_key_test.py
M tools/lexer_generator/transition_keys.py
Index: src/char-predicates.h
diff --git a/src/char-predicates.h b/src/char-predicates.h
index
767ad6513afa7e3d5995a31a1df22372bf7a9484..6e40e77b3bee601240df835d5c8fb0f942cd14d2
100644
--- a/src/char-predicates.h
+++ b/src/char-predicates.h
@@ -66,6 +66,16 @@ struct IdentifierPart {
}
};
+struct IdentifierPartNotLetter {
+ static inline bool Is(uc32 c) {
+ return unibrow::Number::Is(c)
+ || c == 0x200C // U+200C is Zero-Width Non-Joiner.
+ || c == 0x200D // U+200D is Zero-Width Joiner.
+ || unibrow::CombiningMark::Is(c)
+ || unibrow::ConnectorPunctuation::Is(c);
+ }
+};
+
} } // namespace v8::internal
#endif // V8_CHAR_PREDICATES_H_
Index: src/lexer/lexer.gyp
diff --git a/src/lexer/lexer.gyp b/src/lexer/lexer.gyp
index
03a74dfb2edd412e7dc23c69105728f6146476a8..4b961e78ad4ef52640061756fdfb8958dcef58f8
100644
--- a/src/lexer/lexer.gyp
+++ b/src/lexer/lexer.gyp
@@ -74,7 +74,7 @@
'../../tools/lexer_generator/generator.py',
'--re=../../src/lexer/lexer_py.re',
'--code=<(SHARED_INTERMEDIATE_DIR)/generated_lexer_latin1.cc',
- '--char-type=uint8_t',
+ '--encoding=latin1',
],
},
{
@@ -92,7 +92,7 @@
'../../tools/lexer_generator/generator.py',
'--re=../../src/lexer/lexer_py.re',
'--code=<(SHARED_INTERMEDIATE_DIR)/generated_lexer_utf16.cc',
- '--char-type=uint16_t',
+ '--encoding=utf16',
],
},
],
Index: src/lexer/lexer_py.re
diff --git a/src/lexer/lexer_py.re b/src/lexer/lexer_py.re
index
e57f88b147a3b351fe39593c7e64ac40e3c9dfec..9644b4443a4f590a9684527b5f5fabf0d67fa364
100644
--- a/src/lexer/lexer_py.re
+++ b/src/lexer/lexer_py.re
@@ -25,10 +25,10 @@
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-whitespace_char = [ \t\v\f\r:ws:\240];
+whitespace_char = [ \t\v\f\r:whitespace:\240];
whitespace = whitespace_char+;
-identifier_start = [$_a-zA-Z:lit:];
-identifier_char = [0-9:identifier_start:];
+identifier_start = [$_a-zA-Z:letter:];
+identifier_char = [0-9:identifier_part_not_letter::identifier_start:];
line_terminator = [\n\r];
digit = [0-9];
hex_digit = [0-9a-fA-F];
Index: src/scanner.h
diff --git a/src/scanner.h b/src/scanner.h
index
3cefc833ac3323c536ca98015fdfbee1522060d5..4a58b1e366b6c3019444897cd111e0c15e2a8567
100644
--- a/src/scanner.h
+++ b/src/scanner.h
@@ -139,12 +139,18 @@ class UnicodeCache {
bool IsIdentifierPart(unibrow::uchar c) { return
kIsIdentifierPart.get(c); }
bool IsLineTerminator(unibrow::uchar c) { return
kIsLineTerminator.get(c); }
bool IsWhiteSpace(unibrow::uchar c) { return kIsWhiteSpace.get(c); }
+ bool IsLetter(unibrow::uchar c) { return kIsLetter.get(c); }
+ bool IsIdentifierPartNotLetter(unibrow::uchar c) {
+ return kIsIdentifierPartNotLetter.get(c);
+ }
private:
unibrow::Predicate<IdentifierStart, 128> kIsIdentifierStart;
unibrow::Predicate<IdentifierPart, 128> kIsIdentifierPart;
unibrow::Predicate<unibrow::LineTerminator, 128> kIsLineTerminator;
unibrow::Predicate<unibrow::WhiteSpace, 128> kIsWhiteSpace;
+ unibrow::Predicate<unibrow::Letter, 128> kIsLetter;
+ unibrow::Predicate<IdentifierPartNotLetter, 128>
kIsIdentifierPartNotLetter;
StaticResource<Utf8Decoder> utf8_decoder_;
DISALLOW_COPY_AND_ASSIGN(UnicodeCache);
Index: tools/lexer_generator/code_generator.jinja
diff --git a/tools/lexer_generator/code_generator.jinja
b/tools/lexer_generator/code_generator.jinja
index
9db497c4d9f1aeea42d422ea30e24b67bfed72a8..4ed43af92bed51c5143b80432a0271a028032c87
100644
--- a/tools/lexer_generator/code_generator.jinja
+++ b/tools/lexer_generator/code_generator.jinja
@@ -1,6 +1,5 @@
#include "lexer/even-more-experimental-scanner.h"
-{# TODO implement CLASS checks #}
{%- macro do_key(key) -%}
{%- for r in key -%}
{%- if not loop.first %} || {% endif -%}
@@ -19,8 +18,15 @@
(yych == 0 && cursor_ >= buffer_end_)
{%- elif r[1] == 'zero' -%}
(yych == 0 && cursor_ < buffer_end_)
+ {%- elif r[1] == 'whitespace' and encoding == 'utf16'-%}
+ unicode_cache_->IsWhiteSpace(yych)
+ {%- elif r[1] == 'letter' and encoding == 'utf16'-%}
+ {# FIXME: Add and use unicode_cache_->IsNonAsciiLetter #}
+ (!(yych >= 'a' && yych <= 'z') && !(yych >= 'A' && yych <= 'Z')
&& unicode_cache_->IsLetter(yych))
+ {%- elif r[1] == 'identifier_part_not_letter' and encoding
== 'utf16'-%}
+ unicode_cache_->IsIdentifierPartNotLetter(yych)
{%- else -%}
- false
+ false /* {{r[1]}} */
{%- endif -%}
{%- else -%}
false
Index: tools/lexer_generator/code_generator.py
diff --git a/tools/lexer_generator/code_generator.py
b/tools/lexer_generator/code_generator.py
index
3dae1f29deb24b5ff38e55ef2a4ee894f2b401c8..823815edaa93169461603c66e97a90dba8e8700b
100644
--- a/tools/lexer_generator/code_generator.py
+++ b/tools/lexer_generator/code_generator.py
@@ -35,7 +35,7 @@ class CodeGenerator:
def __init__(self,
rule_processor,
- char_type,
+ encoding = 'latin1',
minimize_default = True,
inline = True,
switching = True,
@@ -52,7 +52,7 @@ class CodeGenerator:
self.__log = log
self.__inline = inline
self.__switching = switching
- self.__char_type = char_type
+ self.__encoding = encoding
def __state_cmp(self, left, right):
if left['original_node_number'] == self.__start_node_number:
@@ -242,9 +242,16 @@ class CodeGenerator:
undefined = jinja2.StrictUndefined)
template = template_env.get_template('code_generator.jinja')
+ if self.__encoding == 'latin1':
+ char_type = 'uint8_t'
+ elif self.__encoding == 'utf16':
+ char_type = 'uint16_t'
+ else:
+ raise Exception('Unsupported encoding %s' % self.__encoding)
return template.render(
start_node_number = 0,
debug_print = self.__debug_print,
default_action = default_action,
dfa_states = dfa_states,
- char_type = self.__char_type)
+ encoding = self.__encoding,
+ char_type = char_type)
Index: tools/lexer_generator/generator.py
diff --git a/tools/lexer_generator/generator.py
b/tools/lexer_generator/generator.py
index
4e228304e553c502f9ffbb7fa45acdfc9496494f..5da2446a79a191cafeaf5719ffa32fe80620b986
100644
--- a/tools/lexer_generator/generator.py
+++ b/tools/lexer_generator/generator.py
@@ -97,7 +97,7 @@ if __name__ == '__main__':
parser.add_argument('--re', default='src/lexer/lexer_py.re')
parser.add_argument('--input')
parser.add_argument('--code')
- parser.add_argument('--char-type')
+ parser.add_argument('--encoding', default='latin1')
parser.add_argument('--no-minimize-default', action='store_true')
parser.add_argument('--no-verify-default', action='store_true')
parser.add_argument('--no-inline', action='store_true')
@@ -133,13 +133,9 @@ if __name__ == '__main__':
print "wrote html to %s" % html_file
code_file = args.code
- char_type = args.char_type
- if not char_type:
- char_type = 'uint8_t'
-
if code_file:
code_generator = CodeGenerator(rule_processor,
- char_type,
+ encoding = args.encoding,
minimize_default = minimize_default,
log = verbose,
inline = not args.no_inline,
Index: tools/lexer_generator/transition_key_test.py
diff --git a/tools/lexer_generator/transition_key_test.py
b/tools/lexer_generator/transition_key_test.py
index
3180b02052b5e31070e4e38e6c6828ab67ba6f0e..18ccc71be252efc013a52e6003fc9b35aa764200
100644
--- a/tools/lexer_generator/transition_key_test.py
+++ b/tools/lexer_generator/transition_key_test.py
@@ -51,7 +51,7 @@ class TransitionKeyTestCase(unittest.TestCase):
("1-2", "12", "ab"),
("a-zA-Z", "abyzABYZ" , "123"),
("a-zA-Z0g" , "abyzABYZ0" , "123"),
- ("a-z:ws::lit:" , "abc" , "123"),
+ ("a-z:whitespace::letter:" , "abc" , "123"),
]
classes = {}
for (string, match, no_match) in data:
Index: tools/lexer_generator/transition_keys.py
diff --git a/tools/lexer_generator/transition_keys.py
b/tools/lexer_generator/transition_keys.py
index
f2035d4f4aecd476898b7879c165663ac412fbaf..5fda7affc2e98b86b0b6b95409da7ca1d861bb9e
100644
--- a/tools/lexer_generator/transition_keys.py
+++ b/tools/lexer_generator/transition_keys.py
@@ -41,9 +41,10 @@ class TransitionKey:
# These are not real ranges; they just need to be separate from any
real
# ranges.
'whitespace' : (256, 256),
- 'literal' : (257, 257),
- 'eos' : (258, 258),
- 'zero' : (259, 259),
+ 'letter' : (257, 257),
+ 'identifier_part_not_letter' : (258, 258),
+ 'eos' : (259, 259),
+ 'zero' : (260, 260),
}
__lower_bound = 1
__upper_bound = max(__class_bounds.values(), key=lambda item: item[1])[1]
@@ -139,14 +140,8 @@ class TransitionKey:
TransitionKey.__process_graph(x, ranges, key_map)
elif key == 'CHARACTER_CLASS':
class_name = graph[1]
- if class_name == 'ws':
- ranges.append(TransitionKey.__class_bounds['whitespace'])
- elif class_name == 'lit':
- ranges.append(TransitionKey.__class_bounds['literal'])
- elif class_name == 'eos':
- ranges.append(TransitionKey.__class_bounds['eos'])
- elif class_name == 'zero':
- ranges.append(TransitionKey.__class_bounds['zero'])
+ if class_name in TransitionKey.__class_bounds.keys():
+ ranges.append(TransitionKey.__class_bounds[class_name])
elif class_name in key_map:
ranges += key_map[class_name].__ranges
else:
--
--
v8-dev mailing list
[email protected]
http://groups.google.com/group/v8-dev
---
You received this message because you are subscribed to the Google Groups "v8-dev" group.
To unsubscribe from this group and stop receiving emails from it, send an email
to [email protected].
For more options, visit https://groups.google.com/groups/opt_out.