Hi Chris, The sample code "MyAnalyzer" is not just for the future. If you like, you can use it in your code directly or make some modifications according to your needs. It splits tokens that contain '-' and ','.
DIGY -----Original Message----- From: Chris Martin [mailto:[EMAIL PROTECTED] Sent: Saturday, March 17, 2007 5:58 AM To: [email protected] Subject: RE: Help With Tokenization Dude! Thanks DIGY! This will prove to be useful in the future for sure. :) Chris Martin Software Developer myKB.com | http://mykb.com | [EMAIL PROTECTED] | +1 602-326-5200 -----Original Message----- From: Digy [mailto:[EMAIL PROTECTED] Sent: Friday, March 16, 2007 7:53 PM To: [email protected] Subject: RE: Help With Tokenization Hi Chris, You can write your own analyzer as below to split the text (and use it in both indexing and searching). DIGY public class MyAnalyzer : Lucene.Net.Analysis.Analyzer { public override Lucene.Net.Analysis.TokenStream TokenStream(string fieldName, TextReader reader) { Lucene.Net.Analysis.TokenStream result = new Lucene.Net.Analysis.Standard.StandardTokenizer(reader); result = new Lucene.Net.Analysis.Standard.StandardFilter(result); result = new SplitterFilter(result); return result; } class SplitterFilter : Lucene.Net.Analysis.TokenStream { Lucene.Net.Analysis.TokenStream stream = null; List<Lucene.Net.Analysis.Token> tokens = new List<Lucene.Net.Analysis.Token>(); char[] Separators = new char[] { ',', '-'}; public SplitterFilter(Lucene.Net.Analysis.TokenStream stream) { this.stream = stream; } public override Lucene.Net.Analysis.Token Next() { if (tokens.Count > 0) { Lucene.Net.Analysis.Token t = tokens[0]; tokens.RemoveAt(0); return t; } Lucene.Net.Analysis.Token token = stream.Next(); if (token == null) return null; string termText = token.TermText(); string[] subTokens = termText.Split(Separators, StringSplitOptions.RemoveEmptyEntries); if (subTokens.Length > 1) { int tokenOffset = subTokens[0].Length; for (int i = 1; i < subTokens.Length; i++) { tokenOffset = termText.IndexOf(subTokens[i], tokenOffset); tokens.Add(new Lucene.Net.Analysis.Token( subTokens[i], token.StartOffset(), token.StartOffset() + subTokens[i].Length, 
Lucene.Net.Analysis.Standard.StandardTokenizerConstants.tokenImage[Lucene.Net.Analysis.Standard.StandardTokenizerConstants.ALPHANUM] ) ); tokenOffset += subTokens[i].Length; } return new Lucene.Net.Analysis.Token( subTokens[0], token.StartOffset(), token.StartOffset() + subTokens[0].Length, Lucene.Net.Analysis.Standard.StandardTokenizerConstants.tokenImage[Lucene.Net.Analysis.Standard.StandardTokenizerConstants.ALPHANUM] ); } else { return token; } } public override void Close() { this.stream.Close(); } } } -----Original Message----- From: Martin, Chris [mailto:[EMAIL PROTECTED] Sent: Friday, March 16, 2007 10:24 PM To: [email protected] Subject: Help With Tokenization I have a field that I currently store as a comma-separated list of Guid objects. This field is crucial to our search strategy. I can't figure out how to get those guid objects to be tokenized. I'm playing with the idea of a custom Analyzer and TokenFilter to try and do this, but I'm not sure that's the way to go here. As you can tell, I'm pretty new to Lucene and can't find any good documentation. :) Thanks Chris Martin Software Developer - myKB.com http://mykb.com | [EMAIL PROTECTED] | +1 480-424-6952 x124
