On 22/02/2012 19:39, Marvin Humphrey wrote:
> Okeedoke, let's see what you got. :)
See the attached patch. I moved the word break property lookup into a new
method and overrode that method in a new "WordTokenizer" class.
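To illustrate the intended behaviour (a rough sketch based on the new test
below; split() is the Analyzer convenience method the test uses, and the
before/after tokens are what the UAX #29 rules should give):

    use Lucy::Test;

    my $standard = Lucy::Analysis::StandardTokenizer->new;
    my $word     = LucyX::Analysis::WordTokenizer->new;

    # StandardTokenizer treats the apostrophe as MidNumLet, so the letters
    # on either side stay in a single token:
    $standard->split("o'malley");   # [ "o'malley" ]

    # WordTokenizer maps MidNumLet/MidLetter/MidNum to 0, so it breaks
    # there and the apostrophe is dropped:
    $word->split("o'malley");       # [ "o", "malley" ]

The subclass only overrides Word_Break_Property(), so all of the iteration
logic in StandardTokenizer is reused as-is.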
Nick
diff --git a/core/Lucy/Analysis/StandardTokenizer.c b/core/Lucy/Analysis/StandardTokenizer.c
index 74b9e35..a16aa04 100644
--- a/core/Lucy/Analysis/StandardTokenizer.c
+++ b/core/Lucy/Analysis/StandardTokenizer.c
@@ -22,28 +22,6 @@
#include "Lucy/Analysis/Token.h"
#include "Lucy/Analysis/Inversion.h"
-/*
- * We use a modified version of the Word_Break property defined in UAX #29.
- * CR, LF, Newline and all undefined characters map to 0. WB_ASingle
- * designates characters that are Alphabetic but are excluded from ALetter.
- * WB_Extend_Format includes characters in both Extend and Format. The other
- * WB_* values correspond to the standard properties.
- *
- * The tables are in a compressed format that uses a three-stage lookup
- * scheme. They're generated with the perl script gen_word_break_tables.pl
- * in devel/bin.
- */
-
-#define WB_ASingle 1
-#define WB_ALetter 2
-#define WB_Numeric 3
-#define WB_Katakana 4
-#define WB_ExtendNumLet 5
-#define WB_Extend_Format 6
-#define WB_MidNumLet 7
-#define WB_MidLetter 8
-#define WB_MidNum 9
-
#include "WordBreak.tab"
typedef struct lucy_StringIter {
@@ -52,21 +30,19 @@ typedef struct lucy_StringIter {
} lucy_StringIter;
static int
-S_parse_single(const char *text, size_t len, lucy_StringIter *iter,
- Inversion *inversion);
-
-static int
-S_parse_word(const char *text, size_t len, lucy_StringIter *iter,
- int state, Inversion *inversion);
+S_parse_single(StandardTokenizer *self, const char *text, size_t len,
+ lucy_StringIter *iter, Inversion *inversion);
static int
-S_wb_lookup(const char *ptr);
+S_parse_word(StandardTokenizer *self, const char *text, size_t len,
+ lucy_StringIter *iter, int state, Inversion *inversion);
static void
S_iter_advance(const char *text, lucy_StringIter *iter);
static int
-S_skip_extend_format(const char *text, size_t len, lucy_StringIter *iter);
+S_skip_extend_format(StandardTokenizer *self, const char *text, size_t len,
+ lucy_StringIter *iter);
StandardTokenizer*
StandardTokenizer_new() {
@@ -113,14 +89,14 @@ StandardTokenizer_tokenize_str(StandardTokenizer *self, const char *text,
lucy_StringIter iter = { 0, 0 };
while (iter.byte_pos < len) {
- int wb = S_wb_lookup(text + iter.byte_pos);
+ int wb = StandardTokenizer_Word_Break_Property(self, text + iter.byte_pos);
while (wb >= WB_ASingle && wb <= WB_ExtendNumLet) {
if (wb == WB_ASingle) {
- wb = S_parse_single(text, len, &iter, inversion);
+ wb = S_parse_single(self, text, len, &iter, inversion);
}
else {
- wb = S_parse_word(text, len, &iter, wb, inversion);
+ wb = S_parse_word(self, text, len, &iter, wb, inversion);
}
if (iter.byte_pos >= len) return;
}
@@ -137,10 +113,10 @@ StandardTokenizer_tokenize_str(StandardTokenizer *self, const char *text,
* character.
*/
static int
-S_parse_single(const char *text, size_t len, lucy_StringIter *iter,
- Inversion *inversion) {
+S_parse_single(StandardTokenizer *self, const char *text, size_t len,
+ lucy_StringIter *iter, Inversion *inversion) {
lucy_StringIter start = *iter;
- int wb = S_skip_extend_format(text, len, iter);
+ int wb = S_skip_extend_format(self, text, len, iter);
Token *token = Token_new(text + start.byte_pos,
iter->byte_pos - start.byte_pos,
@@ -156,15 +132,15 @@ S_parse_single(const char *text, size_t len, lucy_StringIter *iter,
* character.
*/
static int
-S_parse_word(const char *text, size_t len, lucy_StringIter *iter,
- int state, Inversion *inversion) {
+S_parse_word(StandardTokenizer *self, const char *text, size_t len,
+ lucy_StringIter *iter, int state, Inversion *inversion) {
int wb = -1;
lucy_StringIter start = *iter;
S_iter_advance(text, iter);
lucy_StringIter end = *iter;
while (iter->byte_pos < len) {
- wb = S_wb_lookup(text + iter->byte_pos);
+ wb = StandardTokenizer_Word_Break_Property(self, text + iter->byte_pos);
switch (wb) {
case WB_ALetter:
@@ -187,7 +163,7 @@ S_parse_word(const char *text, size_t len, lucy_StringIter *iter,
case WB_MidNum:
if (state == WB_ALetter && wb != WB_MidNum
|| state == WB_Numeric && wb != WB_MidLetter) {
- wb = S_skip_extend_format(text, len, iter);
+ wb = S_skip_extend_format(self, text, len, iter);
if (wb == state) { break; }
}
goto word_break;
@@ -228,8 +204,8 @@ word_break:
#define WB_TABLE_LOOKUP(table, id, index) table [ ((id) << 6) | (index) ]
-static int
-S_wb_lookup(const char *ptr) {
+int
+StandardTokenizer_word_break_property(StandardTokenizer *self, const char *ptr) {
uint8_t start = *(uint8_t*)ptr++;
if (start < 0x80) { return wb_ascii[start]; }
@@ -278,13 +254,14 @@ S_iter_advance(const char *text, lucy_StringIter *iter) {
* Returns the word break property of the current character.
*/
static int
-S_skip_extend_format(const char *text, size_t len, lucy_StringIter *iter) {
+S_skip_extend_format(StandardTokenizer *self, const char *text, size_t len,
+ lucy_StringIter *iter) {
int wb = -1;
do {
S_iter_advance(text, iter);
if (iter->byte_pos >= len) { break; }
- wb = S_wb_lookup(text + iter->byte_pos);
+ wb = StandardTokenizer_Word_Break_Property(self, text + iter->byte_pos);
} while (wb == WB_Extend_Format);
return wb;
diff --git a/core/Lucy/Analysis/StandardTokenizer.cfh b/core/Lucy/Analysis/StandardTokenizer.cfh
index 40c0456..961b5c7 100644
--- a/core/Lucy/Analysis/StandardTokenizer.cfh
+++ b/core/Lucy/Analysis/StandardTokenizer.cfh
@@ -50,8 +50,35 @@ class Lucy::Analysis::StandardTokenizer
Tokenize_Str(StandardTokenizer *self, const char *text, size_t len,
Inversion *inversion);
+ int
+ Word_Break_Property(StandardTokenizer *self, const char *text);
+
public bool_t
Equals(StandardTokenizer *self, Obj *other);
}
+/*
+ * We use a modified version of the Word_Break property defined in UAX #29.
+ * CR, LF, Newline and all undefined characters map to 0. WB_ASingle
+ * designates characters that are Alphabetic but are excluded from ALetter.
+ * WB_Extend_Format includes characters in both Extend and Format. The other
+ * WB_* values correspond to the standard properties.
+ *
+ * The tables are in a compressed format that uses a three-stage lookup
+ * scheme. They're generated with the perl script gen_word_break_tables.pl
+ * in devel/bin.
+ */
+
+__C__
+#define WB_ASingle 1
+#define WB_ALetter 2
+#define WB_Numeric 3
+#define WB_Katakana 4
+#define WB_ExtendNumLet 5
+#define WB_Extend_Format 6
+#define WB_MidNumLet 7
+#define WB_MidLetter 8
+#define WB_MidNum 9
+__END_C__
+
diff --git a/core/LucyX/Analysis/WordTokenizer.c b/core/LucyX/Analysis/WordTokenizer.c
new file mode 100644
index 0000000..62c2e26
--- /dev/null
+++ b/core/LucyX/Analysis/WordTokenizer.c
@@ -0,0 +1,56 @@
+/* Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define C_LUCY_WORDTOKENIZER
+#include "Lucy/Util/ToolSet.h"
+
+#include "LucyX/Analysis/WordTokenizer.h"
+
+WordTokenizer*
+WordTokenizer_new() {
+ WordTokenizer *self = (WordTokenizer*)VTable_Make_Obj(WORDTOKENIZER);
+ return WordTokenizer_init(self);
+}
+
+WordTokenizer*
+WordTokenizer_init(WordTokenizer *self) {
+ StandardTokenizer_init((StandardTokenizer*)self);
+ return self;
+}
+
+int
+WordTokenizer_word_break_property(WordTokenizer *self, const char *ptr) {
+ WordTokenizer_word_break_property_t super_word_break_property
+ = (WordTokenizer_word_break_property_t)SUPER_METHOD(WORDTOKENIZER,
+ WordTokenizer, Word_Break_Property);
+ int wb = super_word_break_property(self, ptr);
+
+ if (wb == WB_MidNumLet || wb == WB_MidLetter || wb == WB_MidNum) {
+ wb = 0;
+ }
+
+ return wb;
+}
+
+bool_t
+WordTokenizer_equals(WordTokenizer *self, Obj *other) {
+ WordTokenizer *const twin = (WordTokenizer*)other;
+ if (twin == self) { return true; }
+ if (!Obj_Is_A(other, WORDTOKENIZER)) { return false; }
+ return true;
+}
+
+
diff --git a/core/LucyX/Analysis/WordTokenizer.cfh b/core/LucyX/Analysis/WordTokenizer.cfh
new file mode 100644
index 0000000..d7db125
--- /dev/null
+++ b/core/LucyX/Analysis/WordTokenizer.cfh
@@ -0,0 +1,47 @@
+/* Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+parcel Lucy;
+
+/** Split a string into word-like tokens.
+ *
+ * LucyX::Analysis::WordTokenizer works like
+ * L<StandardTokenizer|Lucy::Analysis::StandardTokenizer> but also splits
+ * at MidLetter, MidNum, and MidNumLet characters, for example apostrophes.
+ *
+ * The result is similar to using a '\w+' regex. Compared to
+ * L<RegexTokenizer|Lucy::Analysis::RegexTokenizer>, WordTokenizer has
+ * better Unicode support and runs faster.
+ */
+class LucyX::Analysis::WordTokenizer
+ inherits Lucy::Analysis::StandardTokenizer {
+
+ inert incremented WordTokenizer*
+ new();
+
+ /** Constructor. Takes no arguments.
+ */
+ public inert WordTokenizer*
+ init(WordTokenizer *self);
+
+ int
+ Word_Break_Property(WordTokenizer *self, const char *text);
+
+ public bool_t
+ Equals(WordTokenizer *self, Obj *other);
+}
+
+
diff --git a/perl/buildlib/LucyX/Build/Binding/Analysis.pm b/perl/buildlib/LucyX/Build/Binding/Analysis.pm
new file mode 100644
index 0000000..b958b57
--- /dev/null
+++ b/perl/buildlib/LucyX/Build/Binding/Analysis.pm
@@ -0,0 +1,48 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+package LucyX::Build::Binding::Analysis;
+use strict;
+use warnings;
+
+sub bind_all {
+ my $class = shift;
+ $class->bind_wordtokenizer;
+}
+
+sub bind_wordtokenizer {
+ my $pod_spec = Clownfish::CFC::Binding::Perl::Pod->new;
+ my $synopsis = <<'END_SYNOPSIS';
+ my $tokenizer = LucyX::Analysis::WordTokenizer->new;
+
+ # Then... once you have a tokenizer, put it into a PolyAnalyzer:
+ my $polyanalyzer = Lucy::Analysis::PolyAnalyzer->new(
+ analyzers => [ $tokenizer, $normalizer, $stemmer ], );
+END_SYNOPSIS
+ my $constructor = <<'END_CONSTRUCTOR';
+ my $tokenizer = LucyX::Analysis::WordTokenizer->new;
+END_CONSTRUCTOR
+ $pod_spec->set_synopsis($synopsis);
+ $pod_spec->add_constructor( alias => 'new', sample => $constructor );
+
+ my $binding = Clownfish::CFC::Binding::Perl::Class->new(
+ parcel => "Lucy",
+ class_name => "LucyX::Analysis::WordTokenizer",
+ );
+ $binding->set_pod_spec($pod_spec);
+
+ Clownfish::CFC::Binding::Perl::Class->register($binding);
+}
+
+1;
diff --git a/perl/lib/LucyX/Analysis/WordTokenizer.pm b/perl/lib/LucyX/Analysis/WordTokenizer.pm
new file mode 100644
index 0000000..d34c8c4
--- /dev/null
+++ b/perl/lib/LucyX/Analysis/WordTokenizer.pm
@@ -0,0 +1,23 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+package LucyX::Analysis::WordTokenizer;
+use Lucy;
+
+1;
+
+__END__
+
+
diff --git a/perl/t/binding/160-word_tokenizer.t b/perl/t/binding/160-word_tokenizer.t
new file mode 100644
index 0000000..fc9f658
--- /dev/null
+++ b/perl/t/binding/160-word_tokenizer.t
@@ -0,0 +1,28 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+use strict;
+use warnings;
+
+use Test::More tests => 2;
+use Lucy::Test;
+
+my $tokenizer = LucyX::Analysis::WordTokenizer->new;
+my $other = LucyX::Analysis::WordTokenizer->new;
+ok( $tokenizer->equals($other), "Equals" );
+
+my $tokens = $tokenizer->split("o'malley's");
+is_deeply( $tokens, [ qw(o malley s) ], "multiple apostrophes" );
+