On 22/02/2012 19:39, Marvin Humphrey wrote:
> Okeedoke, let's see what you got. :)
See the attached patch. I moved the word break property lookup into a new
method and overrode that method in a new "WordTokenizer" class.
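To illustrate the intended behaviour (a rough sketch based on the new test
below; split() is the Analyzer convenience method the test uses, and the
before/after tokens are what the UAX #29 rules should give):

    use Lucy::Test;

    my $standard = Lucy::Analysis::StandardTokenizer->new;
    my $word     = LucyX::Analysis::WordTokenizer->new;

    # StandardTokenizer treats the apostrophe as MidNumLet, so the letters
    # on either side stay in a single token:
    $standard->split("o'malley");   # [ "o'malley" ]

    # WordTokenizer maps MidNumLet/MidLetter/MidNum to 0, so it breaks
    # there and the apostrophe is dropped:
    $word->split("o'malley");       # [ "o", "malley" ]

The subclass only overrides Word_Break_Property(), so all of the iteration
logic in StandardTokenizer is reused as-is.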
Nick
diff --git a/core/Lucy/Analysis/StandardTokenizer.c b/core/Lucy/Analysis/StandardTokenizer.c
index 74b9e35..a16aa04 100644
--- a/core/Lucy/Analysis/StandardTokenizer.c
+++ b/core/Lucy/Analysis/StandardTokenizer.c
@@ -22,28 +22,6 @@
#include "Lucy/Analysis/Token.h"
#include "Lucy/Analysis/Inversion.h"
-/*
- * We use a modified version of the Word_Break property defined in UAX #29.
- * CR, LF, Newline and all undefined characters map to 0. WB_ASingle
- * designates characters that are Alphabetic but are excluded from ALetter.
- * WB_Extend_Format includes characters in both Extend and Format. The other
- * WB_* values correspond to the standard properties.
- *
- * The tables are in a compressed format that uses a three-stage lookup
- * scheme. They're generated with the perl script gen_word_break_tables.pl
- * in devel/bin.
- */
-
-#define WB_ASingle 1
-#define WB_ALetter 2
-#define WB_Numeric 3
-#define WB_Katakana 4
-#define WB_ExtendNumLet 5
-#define WB_Extend_Format 6
-#define WB_MidNumLet 7
-#define WB_MidLetter 8
-#define WB_MidNum 9
-
#include "WordBreak.tab"
typedef struct lucy_StringIter {
@@ -52,21 +30,19 @@ typedef struct lucy_StringIter {
} lucy_StringIter;
static int
-S_parse_single(const char *text, size_t len, lucy_StringIter *iter,
- Inversion *inversion);
-
-static int
-S_parse_word(const char *text, size_t len, lucy_StringIter *iter,
- int state, Inversion *inversion);
+S_parse_single(StandardTokenizer *self, const char *text, size_t len,
+ lucy_StringIter *iter, Inversion *inversion);
static int
-S_wb_lookup(const char *ptr);
+S_parse_word(StandardTokenizer *self, const char *text, size_t len,
+ lucy_StringIter *iter, int state, Inversion *inversion);
static void
S_iter_advance(const char *text, lucy_StringIter *iter);
static int
-S_skip_extend_format(const char *text, size_t len, lucy_StringIter *iter);
+S_skip_extend_format(StandardTokenizer *self, const char *text, size_t len,
+ lucy_StringIter *iter);
StandardTokenizer*
StandardTokenizer_new() {
@@ -113,14 +89,14 @@ StandardTokenizer_tokenize_str(StandardTokenizer *self, const char *text,
lucy_StringIter iter = { 0, 0 };
while (iter.byte_pos < len) {
- int wb = S_wb_lookup(text + iter.byte_pos);
+ int wb = StandardTokenizer_Word_Break_Property(self, text + iter.byte_pos);
while (wb >= WB_ASingle && wb <= WB_ExtendNumLet) {
if (wb == WB_ASingle) {
- wb = S_parse_single(text, len, &iter, inversion);
+ wb = S_parse_single(self, text, len, &iter, inversion);
}
else {
- wb = S_parse_word(text, len, &iter, wb, inversion);
+ wb = S_parse_word(self, text, len, &iter, wb, inversion);
}
if (iter.byte_pos >= len) return;
}
@@ -137,10 +113,10 @@ StandardTokenizer_tokenize_str(StandardTokenizer *self, const char *text,
* character.
*/
static int
-S_parse_single(const char *text, size_t len, lucy_StringIter *iter,
- Inversion *inversion) {
+S_parse_single(StandardTokenizer *self, const char *text, size_t len,
+ lucy_StringIter *iter, Inversion *inversion) {
lucy_StringIter start = *iter;
- int wb = S_skip_extend_format(text, len, iter);
+ int wb = S_skip_extend_format(self, text, len, iter);
Token *token = Token_new(text + start.byte_pos,
iter->byte_pos - start.byte_pos,
@@ -156,15 +132,15 @@ S_parse_single(const char *text, size_t len, lucy_StringIter *iter,
* character.
*/
static int
-S_parse_word(const char *text, size_t len, lucy_StringIter *iter,
- int state, Inversion *inversion) {
+S_parse_word(StandardTokenizer *self, const char *text, size_t len,
+ lucy_StringIter *iter, int state, Inversion *inversion) {
int wb = -1;
lucy_StringIter start = *iter;
S_iter_advance(text, iter);
lucy_StringIter end = *iter;
while (iter->byte_pos < len) {
- wb = S_wb_lookup(text + iter->byte_pos);
+ wb = StandardTokenizer_Word_Break_Property(self, text + iter->byte_pos);
switch (wb) {
case WB_ALetter:
@@ -187,7 +163,7 @@ S_parse_word(const char *text, size_t len, lucy_StringIter *iter,
case WB_MidNum:
if (state == WB_ALetter && wb != WB_MidNum
|| state == WB_Numeric && wb != WB_MidLetter) {
- wb = S_skip_extend_format(text, len, iter);
+ wb = S_skip_extend_format(self, text, len, iter);
if (wb == state) { break; }
}
goto word_break;
@@ -228,8 +204,8 @@ word_break:
#define WB_TABLE_LOOKUP(table, id, index) table [ ((id) << 6) | (index) ]
-static int
-S_wb_lookup(const char *ptr) {
+int
+StandardTokenizer_word_break_property(StandardTokenizer *self, const char *ptr) {
uint8_t start = *(uint8_t*)ptr++;
if (start < 0x80) { return wb_ascii[start]; }
@@ -278,13 +254,14 @@ S_iter_advance(const char *text, lucy_StringIter *iter) {
* Returns the word break property of the current character.
*/
static int
-S_skip_extend_format(const char *text, size_t len, lucy_StringIter *iter) {
+S_skip_extend_format(StandardTokenizer *self, const char *text, size_t len,
+ lucy_StringIter *iter) {
int wb = -1;
do {
S_iter_advance(text, iter);
if (iter->byte_pos >= len) { break; }
- wb = S_wb_lookup(text + iter->byte_pos);
+ wb = StandardTokenizer_Word_Break_Property(self, text + iter->byte_pos);
} while (wb == WB_Extend_Format);
return wb;
diff --git a/core/Lucy/Analysis/StandardTokenizer.cfh b/core/Lucy/Analysis/StandardTokenizer.cfh
index 40c0456..961b5c7 100644
--- a/core/Lucy/Analysis/StandardTokenizer.cfh
+++ b/core/Lucy/Analysis/StandardTokenizer.cfh
@@ -50,8 +50,35 @@ class Lucy::Analysis::StandardTokenizer
Tokenize_Str(StandardTokenizer *self, const char *text, size_t len,
Inversion *inversion);
+ int
+ Word_Break_Property(StandardTokenizer *self, const char *text);
+
public bool_t
Equals(StandardTokenizer *self, Obj *other);
}
+/*
+ * We use a modified version of the Word_Break property defined in UAX #29.
+ * CR, LF, Newline and all undefined characters map to 0. WB_ASingle
+ * designates characters that are Alphabetic but are excluded from ALetter.
+ * WB_Extend_Format includes characters in both Extend and Format. The other
+ * WB_* values correspond to the standard properties.
+ *
+ * The tables are in a compressed format that uses a three-stage lookup
+ * scheme. They're generated with the perl script gen_word_break_tables.pl
+ * in devel/bin.
+ */
+
+__C__
+#define WB_ASingle 1
+#define WB_ALetter 2
+#define WB_Numeric 3
+#define WB_Katakana 4
+#define WB_ExtendNumLet 5
+#define WB_Extend_Format 6
+#define WB_MidNumLet 7
+#define WB_MidLetter 8
+#define WB_MidNum 9
+__END_C__
+
diff --git a/core/LucyX/Analysis/WordTokenizer.c b/core/LucyX/Analysis/WordTokenizer.c
new file mode 100644
index 0000000..62c2e26
--- /dev/null
+++ b/core/LucyX/Analysis/WordTokenizer.c
@@ -0,0 +1,56 @@
+/* Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define C_LUCY_WORDTOKENIZER
+#include "Lucy/Util/ToolSet.h"
+
+#include "LucyX/Analysis/WordTokenizer.h"
+
+WordTokenizer*
+WordTokenizer_new() {
+ WordTokenizer *self = (WordTokenizer*)VTable_Make_Obj(WORDTOKENIZER);
+ return WordTokenizer_init(self);
+}
+
+WordTokenizer*
+WordTokenizer_init(WordTokenizer *self) {
+ StandardTokenizer_init((StandardTokenizer*)self);
+ return self;
+}
+
+int
+WordTokenizer_word_break_property(WordTokenizer *self, const char *ptr) {
+ WordTokenizer_word_break_property_t super_word_break_property
+ = (WordTokenizer_word_break_property_t)SUPER_METHOD(WORDTOKENIZER,
+ WordTokenizer, Word_Break_Property);
+ int wb = super_word_break_property(self, ptr);
+
+ if (wb == WB_MidNumLet || wb == WB_MidLetter || wb == WB_MidNum) {
+ wb = 0;
+ }
+
+ return wb;
+}
+
+bool_t
+WordTokenizer_equals(WordTokenizer *self, Obj *other) {
+ WordTokenizer *const twin = (WordTokenizer*)other;
+ if (twin == self) { return true; }
+ if (!Obj_Is_A(other, WORDTOKENIZER)) { return false; }
+ return true;
+}
+
+
diff --git a/core/LucyX/Analysis/WordTokenizer.cfh b/core/LucyX/Analysis/WordTokenizer.cfh
new file mode 100644
index 0000000..d7db125
--- /dev/null
+++ b/core/LucyX/Analysis/WordTokenizer.cfh
@@ -0,0 +1,47 @@
+/* Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+parcel Lucy;
+
+/** Split a string into word-like tokens.
+ *
+ * LucyX::Analysis::WordTokenizer works like
+ * L<StandardTokenizer|Lucy::Analysis::StandardTokenizer> but also splits
+ * at MidLetter, MidNum, and MidNumLet characters, for example apostrophes.
+ *
+ * The result is similar to using a '\w+' regex. Compared to
+ * L<RegexTokenizer|Lucy::Analysis::RegexTokenizer>, WordTokenizer has
+ * better Unicode support and runs faster.
+ */
+class LucyX::Analysis::WordTokenizer
+ inherits Lucy::Analysis::StandardTokenizer {
+
+ inert incremented WordTokenizer*
+ new();
+
+ /** Constructor. Takes no arguments.
+ */
+ public inert WordTokenizer*
+ init(WordTokenizer *self);
+
+ int
+ Word_Break_Property(WordTokenizer *self, const char *text);
+
+ public bool_t
+ Equals(WordTokenizer *self, Obj *other);
+}
+
+
diff --git a/perl/buildlib/LucyX/Build/Binding/Analysis.pm b/perl/buildlib/LucyX/Build/Binding/Analysis.pm
new file mode 100644
index 0000000..b958b57
--- /dev/null
+++ b/perl/buildlib/LucyX/Build/Binding/Analysis.pm
@@ -0,0 +1,48 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+package LucyX::Build::Binding::Analysis;
+use strict;
+use warnings;
+
+sub bind_all {
+ my $class = shift;
+ $class->bind_wordtokenizer;
+}
+
+sub bind_wordtokenizer {
+ my $pod_spec = Clownfish::CFC::Binding::Perl::Pod->new;
+ my $synopsis = <<'END_SYNOPSIS';
+ my $tokenizer = LucyX::Analysis::WordTokenizer->new;
+
+ # Then... once you have a tokenizer, put it into a PolyAnalyzer:
+ my $polyanalyzer = Lucy::Analysis::PolyAnalyzer->new(
+ analyzers => [ $tokenizer, $normalizer, $stemmer ], );
+END_SYNOPSIS
+ my $constructor = <<'END_CONSTRUCTOR';
+ my $tokenizer = LucyX::Analysis::WordTokenizer->new;
+END_CONSTRUCTOR
+ $pod_spec->set_synopsis($synopsis);
+ $pod_spec->add_constructor( alias => 'new', sample => $constructor );
+
+ my $binding = Clownfish::CFC::Binding::Perl::Class->new(
+ parcel => "Lucy",
+ class_name => "LucyX::Analysis::WordTokenizer",
+ );
+ $binding->set_pod_spec($pod_spec);
+
+ Clownfish::CFC::Binding::Perl::Class->register($binding);
+}
+
+1;
diff --git a/perl/lib/LucyX/Analysis/WordTokenizer.pm b/perl/lib/LucyX/Analysis/WordTokenizer.pm
new file mode 100644
index 0000000..d34c8c4
--- /dev/null
+++ b/perl/lib/LucyX/Analysis/WordTokenizer.pm
@@ -0,0 +1,23 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+package LucyX::Analysis::WordTokenizer;
+use Lucy;
+
+1;
+
+__END__
+
+
diff --git a/perl/t/binding/160-word_tokenizer.t b/perl/t/binding/160-word_tokenizer.t
new file mode 100644
index 0000000..fc9f658
--- /dev/null
+++ b/perl/t/binding/160-word_tokenizer.t
@@ -0,0 +1,28 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+use strict;
+use warnings;
+
+use Test::More tests => 2;
+use Lucy::Test;
+
+my $tokenizer = LucyX::Analysis::WordTokenizer->new;
+my $other = LucyX::Analysis::WordTokenizer->new;
+ok( $tokenizer->equals($other), "Equals" );
+
+my $tokens = $tokenizer->split("o'malley's");
+is_deeply( $tokens, [ qw(o malley s) ], "multiple apostrophes" );
+