Dude! Thanks DIGY! This will prove to be useful in the future for sure. :)

Chris Martin
Software Developer – myKB.com
  http://mykb.com    [EMAIL PROTECTED]    +1 602-326-5200
-----Original Message-----
From: Digy [mailto:[EMAIL PROTECTED] 
Sent: Friday, March 16, 2007 7:53 PM
To: [email protected]
Subject: RE: Help With Tokenization

Hi Chris,

You can write your own analyzer as below to split the text (and use it in
both indexing and searching).


DIGY


/// <summary>
/// Analyzer that tokenizes with StandardTokenizer/StandardFilter, then splits
/// each resulting term on ',' and '-' so comma- or dash-separated values (e.g.
/// a comma-separated list of GUIDs) become individual tokens. Use the same
/// analyzer for both indexing and searching.
/// </summary>
public class MyAnalyzer : Lucene.Net.Analysis.Analyzer
{
    public override Lucene.Net.Analysis.TokenStream TokenStream(string fieldName, TextReader reader)
    {
        Lucene.Net.Analysis.TokenStream result =
            new Lucene.Net.Analysis.Standard.StandardTokenizer(reader);
        result = new Lucene.Net.Analysis.Standard.StandardFilter(result);
        result = new SplitterFilter(result);

        return result;
    }

    /// <summary>
    /// Token filter that breaks a term containing separator characters into
    /// one token per segment, preserving each segment's character offsets
    /// within the original term.
    /// </summary>
    class SplitterFilter : Lucene.Net.Analysis.TokenStream
    {
        private readonly Lucene.Net.Analysis.TokenStream stream;

        // Sub-tokens queued from the last multi-segment term; drained before
        // pulling the next token from the wrapped stream.
        private readonly Queue<Lucene.Net.Analysis.Token> tokens =
            new Queue<Lucene.Net.Analysis.Token>();

        private static readonly char[] Separators = new char[] { ',', '-' };

        public SplitterFilter(Lucene.Net.Analysis.TokenStream stream)
        {
            this.stream = stream;
        }

        public override Lucene.Net.Analysis.Token Next()
        {
            // Emit any pending sub-tokens before consuming more input.
            if (tokens.Count > 0)
            {
                return tokens.Dequeue();
            }

            Lucene.Net.Analysis.Token token = stream.Next();
            if (token == null) return null;

            string termText = token.TermText();
            string[] subTokens = termText.Split(Separators, StringSplitOptions.RemoveEmptyEntries);
            if (subTokens.Length <= 1)
            {
                // Nothing to split: pass the token through untouched.
                return token;
            }

            string alphanumType =
                Lucene.Net.Analysis.Standard.StandardTokenizerConstants.tokenImage[
                    Lucene.Net.Analysis.Standard.StandardTokenizerConstants.ALPHANUM];

            // BUG FIX: the original computed each segment's position with
            // IndexOf but then ignored it, stamping every sub-token with the
            // parent token's StartOffset(). Here each sub-token gets its true
            // start/end offsets within the source text, so highlighting and
            // offset-based features work correctly.
            Lucene.Net.Analysis.Token first = null;
            int searchFrom = 0;
            for (int i = 0; i < subTokens.Length; i++)
            {
                int pos = termText.IndexOf(subTokens[i], searchFrom);
                Lucene.Net.Analysis.Token sub = new Lucene.Net.Analysis.Token(
                    subTokens[i],
                    token.StartOffset() + pos,
                    token.StartOffset() + pos + subTokens[i].Length,
                    alphanumType);
                searchFrom = pos + subTokens[i].Length;

                if (first == null)
                {
                    first = sub;
                }
                else
                {
                    tokens.Enqueue(sub);
                }
            }

            return first;
        }

        public override void Close()
        {
            this.stream.Close();
        }
    }
}

-----Original Message-----
From: Martin, Chris [mailto:[EMAIL PROTECTED] 
Sent: Friday, March 16, 2007 10:24 PM
To: [email protected]
Subject: Help With Tokenization

I have a field that I currently store as a comma separated list of Guid
objects. This field is crucial to our search strategy.

 

I can't figure out how to get those guid objects to be tokenized. I'm
playing with the idea of a custom Analyzer and TokenFilter to try and do
this but, I'm not sure that's the way to go here. 

 

As you can tell, I'm pretty new to Lucene and can't find any good documentation.
:)

 

Thanks

 

Chris Martin

Software Developer - myKB.com

http://mykb.com    [EMAIL PROTECTED]    +1 480-424-6952 x124

 



Reply via email to