[ http://issues.apache.org/jira/browse/NUTCH-36?page=comments#action_12330588 ]
Kerang Lv commented on NUTCH-36:
--------------------------------
Code like the following can be used to plug third-party CJK word
segmentation into NutchAnalysis.jj. CJKTokenizer, a bi-gram segmenter,
is used in the example below.
================================================================================
@@ -33,6 +33,7 @@
import org.apache.nutch.searcher.Query.Clause;
import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.cjk.CJKTokenizer;
import java.io.*;
import java.util.*;
@@ -81,6 +82,14 @@
PARSER_END(NutchAnalysis)
TOKEN_MGR_DECLS : {
+ /** use CJKTokenizer to process cjk character */
+ private CJKTokenizer cjkTokenizer = null;
+
+ /** a global cjk token */
+ private org.apache.lucene.analysis.Token cjkToken = null;
+
+ /** start offset of cjk sequence */
+ private int cjkStartOffset = 0;
/** Constructs a token manager for the provided Reader. */
public NutchAnalysisTokenManager(Reader reader) {
@@ -106,7 +115,46 @@
}
// chinese, japanese and korean characters
-| <SIGRAM: <CJK> >
+| <SIGRAM: (<CJK>)+ >
+ {
+ /**
+ * use an instance of CJKTokenizer, cjkTokenizer, hold the maximum
+ * matched cjk chars, and cjkToken for the current token;
+ * reset matchedToken.image use cjkToken.termText();
+ * reset matchedToken.beginColumn use cjkToken.startOffset();
+ * reset matchedToken.endColumn use cjkToken.endOffset();
+ * backup the last char when the next cjkToken is valid.
+ */
+ if(cjkTokenizer == null) {
+ cjkTokenizer = new CJKTokenizer(new StringReader(image.toString()));
+ cjkStartOffset = matchedToken.beginColumn;
+ try {
+ cjkToken = cjkTokenizer.next();
+ } catch(IOException ioe) {
+ cjkToken = null;
+ }
+ }
+
+ if(cjkToken != null && !cjkToken.termText().equals("")) {
+ //sometime the cjkTokenizer returns an empty string, is it a bug?
+ matchedToken.image = cjkToken.termText();
+ matchedToken.beginColumn = cjkStartOffset + cjkToken.startOffset();
+ matchedToken.endColumn = cjkStartOffset + cjkToken.endOffset();
+ try {
+ cjkToken = cjkTokenizer.next();
+ } catch(IOException ioe) {
+ cjkToken = null;
+ }
+ if(cjkToken != null && !cjkToken.termText().equals("")) {
+ input_stream.backup(1);
+ }
+ }
+
+ if(cjkToken == null || cjkToken.termText().equals("")) {
+ cjkTokenizer = null;
+ cjkStartOffset = 0;
+ }
+ }
> Chinese in Nutch
> ----------------
>
> Key: NUTCH-36
> URL: http://issues.apache.org/jira/browse/NUTCH-36
> Project: Nutch
> Type: Improvement
> Components: indexer, searcher
> Environment: all
> Reporter: Jack Tang
> Priority: Minor
> Attachments: 桌
>
> Nutch currently supports Chinese in a very simple way: NutchAnalysis segments
> CJK terms word-by-word.
> So, if I search for the Chinese term 'FooBar' (two Chinese words: 'Foo' and
> 'Bar'), the results in the web GUI will highlight 'FooBar' as well as 'Foo'
> and 'Bar', whereas we expect Nutch to highlight only 'FooBar'.
--
This message is automatically generated by JIRA.
-
If you think it was sent incorrectly, contact one of the administrators:
http://issues.apache.org/jira/secure/Administrators.jspa
-
For more information on JIRA, see:
http://www.atlassian.com/software/jira