Repository: lucy Updated Branches: refs/heads/master 51f7418de -> 2bc4edbd9
Add and fix StandardTokenizer comments Project: http://git-wip-us.apache.org/repos/asf/lucy/repo Commit: http://git-wip-us.apache.org/repos/asf/lucy/commit/2bc4edbd Tree: http://git-wip-us.apache.org/repos/asf/lucy/tree/2bc4edbd Diff: http://git-wip-us.apache.org/repos/asf/lucy/diff/2bc4edbd Branch: refs/heads/master Commit: 2bc4edbd9de809eb542f152ab8e19fda6fa77532 Parents: 51f7418 Author: Nick Wellnhofer <[email protected]> Authored: Mon Jan 12 15:41:38 2015 +0100 Committer: Nick Wellnhofer <[email protected]> Committed: Mon Jan 12 15:41:38 2015 +0100 ---------------------------------------------------------------------- core/Lucy/Analysis/StandardTokenizer.c | 19 +++++++++++-------- core/Lucy/Analysis/StandardTokenizer.cfh | 2 +- devel/bin/gen_word_break_data.pl | 16 +++++++++++++++- 3 files changed, 27 insertions(+), 10 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/lucy/blob/2bc4edbd/core/Lucy/Analysis/StandardTokenizer.c ---------------------------------------------------------------------- diff --git a/core/Lucy/Analysis/StandardTokenizer.c b/core/Lucy/Analysis/StandardTokenizer.c index 318dcd7..23b25e3 100644 --- a/core/Lucy/Analysis/StandardTokenizer.c +++ b/core/Lucy/Analysis/StandardTokenizer.c @@ -31,7 +31,7 @@ * * The tables are in a compressed format that uses a three-stage lookup * scheme. They're generated with the perl script gen_word_break_tables.pl - * in devel/bin. + * in devel/bin. The WB_* constants must match the values used in the script. */ #define WB_ASingle 1 @@ -137,9 +137,9 @@ StandardTokenizer_Tokenize_Utf8_IMP(StandardTokenizer *self, const char *text, /* * Parse a word consisting of a single codepoint followed by extend or * format characters. Used for Alphabetic characters that don't have the - * ALetter word break property: ideographs, Hiragana, and "complex content". - * Advances the iterator and returns the word break property of the current - * character. 
+ * ALetter word break property: ideographs, Hiragana, and "complex context". + * Advances the iterator and returns the word break property of the character + * following the word. */ static int S_parse_single(const char *text, size_t len, lucy_StringIter *iter, @@ -156,9 +156,12 @@ S_parse_single(const char *text, size_t len, lucy_StringIter *iter, } /* - * Parse a word starting with an ALetter, Numeric, Katakana, or ExtendNumLet - * character. Advances the iterator and returns the word break property of the - * current character. + * Parse a word starting with an ALetter, Hebrew_Letter, Numeric, Katakana, or + * ExtendNumLet character. Advances the iterator and returns the word break + * property of the character following the word. + * + * TODO: Words consisting only of ExtendNumLet characters (General_Category + * Pc, typically underscores) should be ignored. */ static int S_parse_word(const char *text, size_t len, lucy_StringIter *iter, @@ -320,7 +323,7 @@ S_iter_advance(const char *text, lucy_StringIter *iter) { /* * Advances the iterator skipping over Extend and Format characters. - * Returns the word break property of the current character. + * Returns the word break property of the following character. */ static int S_skip_extend_format(const char *text, size_t len, lucy_StringIter *iter) { http://git-wip-us.apache.org/repos/asf/lucy/blob/2bc4edbd/core/Lucy/Analysis/StandardTokenizer.cfh ---------------------------------------------------------------------- diff --git a/core/Lucy/Analysis/StandardTokenizer.cfh b/core/Lucy/Analysis/StandardTokenizer.cfh index 2bac5c7..211309f 100644 --- a/core/Lucy/Analysis/StandardTokenizer.cfh +++ b/core/Lucy/Analysis/StandardTokenizer.cfh @@ -24,7 +24,7 @@ parcel Lucy; * * Lucy::Analysis::StandardTokenizer breaks up the text at the word * boundaries defined in Unicode Standard Annex #29. It then returns those - * words that start with an alphabetic or numeric character. 
+ * words that contain alphabetic or numeric characters. */ public class Lucy::Analysis::StandardTokenizer inherits Lucy::Analysis::Analyzer { http://git-wip-us.apache.org/repos/asf/lucy/blob/2bc4edbd/devel/bin/gen_word_break_data.pl ---------------------------------------------------------------------- diff --git a/devel/bin/gen_word_break_data.pl b/devel/bin/gen_word_break_data.pl index c94d18a..eb64fea 100755 --- a/devel/bin/gen_word_break_data.pl +++ b/devel/bin/gen_word_break_data.pl @@ -95,7 +95,21 @@ my $alpha = UnicodeTable->read( map => { Alphabetic => 1 }, ); -# Set characters in Alphabetic but not in Word_Break to WB_ASingle = 1 +# Many characters don't have a Word_Break property and form a single word. +# In order to include them in the tokenizing process, we use a custom +# property "ASingle" with value 1. +# +# For now, this property is used for all Alphabetic characters without a +# Word_Break property: Ideographic, Hiragana, and Complex_Context. +# +# There are also non-alphabetic, numeric characters without a Word_Break +# property that possibly should be included: +# +# - Decimal numbers (General_Category Nd) with East_Asian_Width F (Fullwidth) +# - Other numbers (General_Category No) +# +# These are ignored for now. + for ( my $i = 0; $i < 0x30000; ++$i ) { if ( !$wb->lookup($i) && $alpha->lookup($i) ) { $wb->set( $i, 1 );
