Upgrade StandardTokenizer to Unicode 6.2.0
Project: http://git-wip-us.apache.org/repos/asf/lucy/repo Commit: http://git-wip-us.apache.org/repos/asf/lucy/commit/0df8da88 Tree: http://git-wip-us.apache.org/repos/asf/lucy/tree/0df8da88 Diff: http://git-wip-us.apache.org/repos/asf/lucy/diff/0df8da88 Branch: refs/heads/master Commit: 0df8da88c95ab33edf4a71c63f1d3f720a23117d Parents: 1dced22 Author: Nick Wellnhofer <[email protected]> Authored: Mon Mar 25 21:51:06 2013 +0100 Committer: Nick Wellnhofer <[email protected]> Committed: Mon Mar 25 21:51:06 2013 +0100 ---------------------------------------------------------------------- core/Lucy/Analysis/StandardTokenizer.c | 6 +- core/Lucy/Test/Analysis/TestStandardTokenizer.c | 12 +- devel/bin/UnicodeTable.pm | 5 +- devel/bin/gen_word_break_data.pl | 29 +- modules/unicode/ucd/WordBreak.tab | 946 +++++++++--------- modules/unicode/ucd/WordBreakTest.json | 511 ++++++++++ 6 files changed, 1034 insertions(+), 475 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/lucy/blob/0df8da88/core/Lucy/Analysis/StandardTokenizer.c ---------------------------------------------------------------------- diff --git a/core/Lucy/Analysis/StandardTokenizer.c b/core/Lucy/Analysis/StandardTokenizer.c index a24c2b8..2a8665e 100644 --- a/core/Lucy/Analysis/StandardTokenizer.c +++ b/core/Lucy/Analysis/StandardTokenizer.c @@ -151,9 +151,9 @@ S_parse_single(const char *text, size_t len, lucy_StringIter *iter, } /* - * Parse a word starting with an ALetter, Numeric or Katakana character. - * Advances the iterator and returns the word break property of the current - * character. + * Parse a word starting with an ALetter, Numeric, Katakana, or ExtendNumLet + * character. Advances the iterator and returns the word break property of the + * current character. */ static int S_parse_word(const char *text, size_t len, lucy_StringIter *iter, http://git-wip-us.apache.org/repos/asf/lucy/blob/0df8da88/core/Lucy/Test/Analysis/TestStandardTokenizer.c ---------------------------------------------------------------------- diff --git a/core/Lucy/Test/Analysis/TestStandardTokenizer.c b/core/Lucy/Test/Analysis/TestStandardTokenizer.c index b7851a6..9e550a0 100644 --- a/core/Lucy/Test/Analysis/TestStandardTokenizer.c +++ b/core/Lucy/Test/Analysis/TestStandardTokenizer.c @@ -26,20 +26,24 @@ TestStandardTokenizer* TestStandardTokenizer_new(TestFormatter *formatter) { - TestStandardTokenizer *self = (TestStandardTokenizer*)VTable_Make_Obj(TESTSTANDARDTOKENIZER); + TestStandardTokenizer *self + = (TestStandardTokenizer*)VTable_Make_Obj(TESTSTANDARDTOKENIZER); return TestStandardTokenizer_init(self, formatter); } TestStandardTokenizer* -TestStandardTokenizer_init(TestStandardTokenizer *self, TestFormatter *formatter) { - return (TestStandardTokenizer*)TestBatch_init((TestBatch*)self, 984, formatter); +TestStandardTokenizer_init(TestStandardTokenizer *self, + TestFormatter *formatter) { + TestBatch_init((TestBatch*)self, 1084, formatter); + return self; } static void test_Dump_Load_and_Equals(TestBatch *batch) { StandardTokenizer *tokenizer = StandardTokenizer_new(); Obj *dump = StandardTokenizer_Dump(tokenizer); - StandardTokenizer *clone = (StandardTokenizer*)StandardTokenizer_Load(tokenizer, dump); + StandardTokenizer *clone + = (StandardTokenizer*)StandardTokenizer_Load(tokenizer, dump); TEST_TRUE(batch, StandardTokenizer_Equals(tokenizer, (Obj*)clone), http://git-wip-us.apache.org/repos/asf/lucy/blob/0df8da88/devel/bin/UnicodeTable.pm ---------------------------------------------------------------------- diff --git a/devel/bin/UnicodeTable.pm b/devel/bin/UnicodeTable.pm index b233bfd..6cd1ed5 100644 --- a/devel/bin/UnicodeTable.pm +++ b/devel/bin/UnicodeTable.pm @@ -16,6 +16,8 @@ package UnicodeTable; use strict; +use IO::File; + =head1 NAME UnicodeTable - Create compressed Unicode tables for C programs @@ -110,7 +112,8 @@ sub read { my $map = $opts->{map} or die('map missing'); $type = lc($type); - open( my $file, '<', $filename ) + my $file = IO::File->new; + $file->open( $filename, '<' ) or die("$filename: $!\n"); while ( my $line = $file->getline ) { http://git-wip-us.apache.org/repos/asf/lucy/blob/0df8da88/devel/bin/gen_word_break_data.pl ---------------------------------------------------------------------- diff --git a/devel/bin/gen_word_break_data.pl b/devel/bin/gen_word_break_data.pl old mode 100644 new mode 100755 index d72df05..9bcf916 --- a/devel/bin/gen_word_break_data.pl +++ b/devel/bin/gen_word_break_data.pl @@ -32,7 +32,7 @@ the UCD to JSON. UCD_SRC_DIR should point to a directory containing the files WordBreakProperty.txt, WordBreakTest.txt, and DerivedCoreProperties.txt from the Unicode Character Database available at -L<http://www.unicode.org/Public/6.0.0/ucd/>. +L<http://www.unicode.org/Public/6.2.0/ucd/>. =head1 OUTPUT FILES @@ -58,18 +58,19 @@ my $table_filename = "$output_dir/WordBreak.tab"; my $tests_filename = "$output_dir/WordBreakTest.json"; my %wb_map = ( - CR => 0, - LF => 0, - Newline => 0, - ALetter => 2, - Numeric => 3, - Katakana => 4, - ExtendNumLet => 5, - Extend => 6, - Format => 6, - MidNumLet => 7, - MidLetter => 8, - MidNum => 9, + CR => 0, + LF => 0, + Newline => 0, + Regional_Indicator => 0, # These are symbols, so ignore them. + ALetter => 2, + Numeric => 3, + Katakana => 4, + ExtendNumLet => 5, + Extend => 6, + Format => 6, + MidNumLet => 7, + MidLetter => 8, + MidNum => 9, ); my %opts; @@ -206,7 +207,7 @@ __DATA__ This file is generated with devel/bin/gen_word_break_data.pl. DO NOT EDIT! The contents of this file are derived from the Unicode Character Database, -version 6.0.0, available from http://www.unicode.org/Public/6.0.0/ucd/. +version 6.2.0, available from http://www.unicode.org/Public/6.2.0/ucd/. The Unicode copyright and permission notice follows. Copyright (c) 1991-2011 Unicode, Inc. All rights reserved. Distributed under
