I don't know if anyone has been in touch, but I noticed that Troy created a ticket for this and applied a patch.
https://issues.apache.org/jira/browse/LUCENENET-437 I know that isn't a direct answer to your response, but you could do a diff and see what was changed/ported/patched to answer the question. - Michael On Sat, Jul 16, 2011 at 5:32 PM, Jennifer Wilson < jennifer.wil...@researchintegrations.com> wrote: > Hi all, > > > > I'd really like to use the ShingleFilter in my Lucene.Net search app but it > has not yet been ported from Java. So, although I thought it'd be coding > on > the edges of my weight class, I thought I'd give porting it over from Java > to C# a try. I grabbed the source code from the version 2.9.2 package > org.apache.lucene.analysis.shingle.ShingleFilter.java and went on my way. > > > > But my lack of coding experience is catching up to me, I've hit a number of > stumbling blocks and unfortunately now I think I've hit an impasse. I've > been especially struggling with (and really just understanding what should > be happening in) the *IncrementToken() *method. So, I thought I'd post what > I have here (I hope this is the right place to do this – please let me know > if I should post this somewhere else). I'm pretty confident that the more > experienced coders out there will be able to just look at what I've done > and > see the problems immediately… > > > > Right now I'm getting the following error: > > System.SystemException: System.NullReferenceException: Object reference not > set to an instance of an object. > > at LuceneIndexer.ShingleFilter.FillShingleBuffer() in > C:\Users\..\ShingleFilter.cs:line 412 > > at LuceneIndexer.ShingleFilter.FillShingleBuffer() in > C:\Users\..\ShingleFilter.cs:line 429 > > at LuceneIndexer.ShingleFilter.IncrementToken() in > C:\Users\..\ShingleFilter.cs:line 216 > > > > I'd really appreciate any corrections, feedback, or resources that anyone > can offer. > > > > Thank you in advance for taking a look at this!
> > - Jen > > > > /** > > * Licensed to the Apache Software Foundation (ASF) under one or more > > * contributor license agreements. See the NOTICE file distributed with > > * this work for additional information regarding copyright ownership. > > * The ASF licenses this file to You under the Apache License, Version 2.0 > > * (the "License"); you may not use this file except in compliance with > > * the License. You may obtain a copy of the License at > > * > > * http://www.apache.org/licenses/LICENSE-2.0 > > * > > * Unless required by applicable law or agreed to in writing, software > > * distributed under the License is distributed on an "AS IS" BASIS, > > * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. > > * See the License for the specific language governing permissions and > > * limitations under the License. > > * > > */ > > > > /* ------------- > > * SHINGLEFILTER from org.apache.lucene.analysis.shingle.ShingleFilter > version 2.9.2 > > * ------------- > > */ > > > > using System; > > using System.Collections; > > using System.Text; > > using Lucene.Net.Analysis; > > using Lucene.Net.Analysis.Standard; > > > > using TermAttribute = Lucene.Net.Analysis.Tokenattributes.TermAttribute; > > using OffsetAttribute = > Lucene.Net.Analysis.Tokenattributes.OffsetAttribute; > > using PositionIncrementAttribute = Lucene.Net.Analysis.Tokenattributes. > PositionIncrementAttribute; > > using TypeAttribute = Lucene.Net.Analysis.Tokenattributes.TypeAttribute; > > using AttributeSource = Lucene.Net.Util.AttributeSource; > > > > /// <summary>A ShingleFilter constructs shingles (token n-grams) from a > token stream. > > /// In other words, it creates combinations of tokens as a single token. > > /// > > /// <p>For example, the sentence "please divide this sentence into > shingles" > > /// might be tokenized into shingles "please divide", "divide this", > > /// "this sentence", "sentence into", and "into shingles". 
> > /// > > /// <p>This filter handles position increments > 1 by inserting filler > tokens > > /// (tokens with termtext "_"). It does not handle a position increment of > 0. > > /// </summary> > > /// > > > > namespace LuceneIndexer > > { > > public class ShingleFilter : TokenFilter > > { > > /** maximum shingle size (number of tokens) */ > > internal int maxShingleSize; > > > > private TermAttribute termAtt; > > private OffsetAttribute offsetAtt; > > private PositionIncrementAttribute posIncrAtt; > > private TypeAttribute typeAtt; > > > > private ArrayList shingleBuf = new ArrayList(); > > private StringBuilder[] shingles; > > private string tokenType = "shingle"; > > > > > > /** filler token for when positionIncrement is more than 1 */ > > public static readonly char[] FILLER_TOKEN = { '_' }; > > > > /** default maximum shingle size is 2. */ > > public int DEFAULT_MAX_SHINGLE_SIZE = 2; > > > > /** The string to use when joining adjacent tokens to form a shingle > */ > > public const string TOKEN_SEPARATOR = " "; > > > > /** By default, we output unigrams (individual tokens) as well as > shingles (token n-grams). */ > > private bool outputUnigrams = true; > > > > > > /** > > * Constructs a ShingleFilter with the specified single size from the > > * {@link TokenStream} <code>input</code> > > * > > * @param input input stream > > * @param maxShingleSize maximum shingle size produced by the filter. 
> > */ > > public ShingleFilter(TokenStream in_Renamed, int maxShingleSize) : > base(in_Renamed) > > { > > try > > { > > this.maxShingleSize = maxShingleSize; > > termAtt = (TermAttribute)AddAttribute(typeof(TermAttribute > )); > > offsetAtt = (OffsetAttribute)AddAttribute(typeof( > OffsetAttribute)); > > posIncrAtt = (PositionIncrementAttribute)AddAttribute(typeof > (PositionIncrementAttribute)); > > typeAtt = (TypeAttribute)AddAttribute(typeof(TypeAttribute > )); > > } > > catch (System.Exception e) > > { > > throw new System.SystemException(e.ToString()); > > } > > } > > > > /** > > * Construct a ShingleFilter with default shingle size. > > * > > * @param input input stream > > */ > > public ShingleFilter(TokenStream in_Renamed) : base(in_Renamed) > > { > > try > > { > > this.maxShingleSize = DEFAULT_MAX_SHINGLE_SIZE; > > termAtt = (TermAttribute)AddAttribute(typeof(TermAttribute > )); > > offsetAtt = (OffsetAttribute)AddAttribute(typeof( > OffsetAttribute)); > > posIncrAtt = (PositionIncrementAttribute)AddAttribute(typeof > (PositionIncrementAttribute)); > > typeAtt = (TypeAttribute)AddAttribute(typeof(TypeAttribute > )); > > } > > catch (System.Exception e) > > { > > throw new System.SystemException(e.ToString()); > > } > > } > > > > /** > > * Construct a ShingleFilter with the specified token type for > shingle tokens. 
> > * > > * @param input input stream > > * @param tokenType token type for shingle tokens > > */ > > public ShingleFilter(TokenStream in_Renamed, string tokenType) : > base(in_Renamed) > > { > > try > > { > > this.maxShingleSize = DEFAULT_MAX_SHINGLE_SIZE; > > this.tokenType = tokenType; > > termAtt = (TermAttribute)AddAttribute(typeof(TermAttribute > )); > > offsetAtt = (OffsetAttribute)AddAttribute(typeof( > OffsetAttribute)); > > posIncrAtt = (PositionIncrementAttribute)AddAttribute(typeof > (PositionIncrementAttribute)); > > typeAtt = (TypeAttribute)AddAttribute(typeof(TypeAttribute > )); > > } > > catch (System.Exception e) > > { > > throw new System.SystemException(e.ToString()); > > } > > } > > > > /** > > * Set the type of the shingle tokens produced by this filter. > > * (default: "shingle") > > * > > * @param tokenType token tokenType > > */ > > public void SetTokenType(String tokenType) > > { > > this.tokenType = tokenType; > > } > > > > /** > > * Shall the output stream contain the input tokens (unigrams) as > well as > > * shingles? (default: true.) > > * > > * @param outputUnigrams Whether or not the output stream shall > contain > > * the input tokens (unigrams) > > */ > > public void SetOutputUnigrams(bool outputUnigrams) > > { > > this.outputUnigrams = outputUnigrams; > > } > > > > /** > > * Set the max shingle size (default: 2) > > * > > * @param maxShingleSize max size of output shingles > > */ > > public void SetMaxShingleSize(int maxShingleSize) > > { > > if (maxShingleSize < 2) > > { > > throw new System.SystemException("Max shingle size must be > >= 2"); > > } > > this.shingles = new StringBuilder[maxShingleSize]; > > for (int i = 0; i < shingles.Length; i++) > > { > > shingles[i] = new StringBuilder(); > > } > > this.maxShingleSize = maxShingleSize; > > } > > > > /** > > * Clear the StringBuilder shingles that are used for storing the > output shingles. 
> > */ > > private void ClearShingles() > > { > > if (shingles != null) > > { > > for (int i = 0; i < shingles.Length; i++) > > { > > shingles[i].Length = 0; > > } > > } > > } > > > > private AttributeSource.State nextToken; > > private int shingleBufferPosition; > > private int[] endOffsets; > > > > /* (non-Javadoc) > > * @see org.apache.lucene.analysis.TokenStream#next() > > */ > > public override bool IncrementToken() > > { > > try > > { > > while (true) > > { > > if (nextToken == null) > > { > > if (!FillShingleBuffer()) > > { > > return false; > > } > > } > > > > IEnumerator it = shingleBuf.GetEnumerator(); > > it.MoveNext(); > > nextToken = (AttributeSource.State)it.Current; > > > > if (outputUnigrams) > > { > > if (shingleBufferPosition == 0) > > { > > base.RestoreState(nextToken); > > posIncrAtt.SetPositionIncrement(1); > > shingleBufferPosition++; > > return true; > > } > > } > > else if (shingleBufferPosition % this.maxShingleSize == > 0) > > { > > shingleBufferPosition++; > > } > > > > if (shingleBufferPosition < shingleBuf.Count) > > { > > base.RestoreState(nextToken); > > typeAtt.SetType(tokenType); > > offsetAtt.SetOffset(offsetAtt.StartOffset(), > endOffsets[shingleBufferPosition]); > > StringBuilder buf = shingles[shingleBufferPosition]; > > int termLength = buf.Length; > > char[] termBuffer = termAtt.TermBuffer(); > > if (termBuffer.Length < termLength) > > { > > termBuffer = > termAtt.ResizeTermBuffer(termLength); > > } > > > > termBuffer = buf.ToString().ToCharArray(0, > termLength); > > > > termAtt.SetTermLength(termLength); > > if ((!outputUnigrams) && shingleBufferPosition % > this.maxShingleSize == 1) > > { > > posIncrAtt.SetPositionIncrement(1); > > } > > else > > { > > posIncrAtt.SetPositionIncrement(0); > > } > > shingleBufferPosition++; > > if (shingleBufferPosition == shingleBuf.Count) > > { > > nextToken = null; > > shingleBufferPosition = 0; > > } > > return true; > > } > > else > > { > > nextToken = null; > > shingleBufferPosition = 
0; > > } > > } > > } > > catch (System.Exception e) > > { > > throw new System.SystemException(e.ToString()); > > } > > } > > > > private int numFillerTokensToInsert; > > private AttributeSource.State currentToken; > > private bool hasCurrentToken; > > > > /** > > * Get the next token from the input stream and push it on the token > buffer. > > * If we encounter a token with position increment > 1, we put > filler tokens > > * on the token buffer. > > * <p/> > > * Returns null when the end of the input stream is reached. > > * @return the next token, or null if at end of input stream > > * @throws IOException if the input stream has a problem > > */ > > private bool getNextToken() > > { > > try > > { > > while (true) > > { > > if (numFillerTokensToInsert > 0) > > { > > if (currentToken == null) > > { > > currentToken = CaptureState(); > > } > > else > > { > > RestoreState(currentToken); > > } > > numFillerTokensToInsert--; > > // A filler token occupies no space > > offsetAtt.SetOffset(offsetAtt.StartOffset(), > offsetAtt.StartOffset()); > > termAtt.SetTermBuffer(FILLER_TOKEN, 0, > FILLER_TOKEN.Length); > > return true; > > } > > > > if (hasCurrentToken) > > { > > if (currentToken != null) > > { > > RestoreState(currentToken); > > currentToken = null; > > } > > hasCurrentToken = false; > > return true; > > } > > > > if (!input.IncrementToken()) > > { > > return false; > > } > > hasCurrentToken = true; > > > > if (posIncrAtt.GetPositionIncrement() > 1) > > { > > numFillerTokensToInsert = > posIncrAtt.GetPositionIncrement() - 1; > > } > > } > > } > > catch (System.Exception e) > > { > > throw new System.SystemException(e.ToString()); > > } > > } > > /** > > * Fill the output buffer with new shingles. > > * > > * @throws IOException if there's a problem getting the next token > > */ > > private bool FillShingleBuffer() > > { > > try > > { > > bool addedToken = false; > > /* Try to fill the shingle buffer. 
*/ > > do > > { > > if (getNextToken()) > > { > > shingleBuf.Add(CaptureState()); > > if (shingleBuf.Count > maxShingleSize) > > { > > shingleBuf.RemoveAt(0); > > } > > addedToken = true; > > } > > else > > { > > break; > > } > > } while (shingleBuf.Count < maxShingleSize); > > > > if (shingleBuf.Count < 1) > > { > > return false; > > } > > > > /* > > * If no new token could be added to the shingle buffer, we > have reached > > * the end of the input stream and have to discard the least > recent token. > > */ > > if (!addedToken) > > { > > shingleBuf.RemoveAt(0); > > } > > > > if (shingleBuf.Count < 1) > > { > > return false; > > > > } > > > > ClearShingles(); > > int i; > > endOffsets = new int[shingleBuf.Count]; > > for (i = 0; i < endOffsets.Length; i++) > > { > > endOffsets[i] = 0; > > } > > > > i = 0; > > for (IEnumerator it = shingleBuf.GetEnumerator(); > it.MoveNext(); ) > > { > > RestoreState((AttributeSource.State)it.Current); > > for (int j = i; j < shingles.Length; j++) > > { > > if (shingles[j].Length != 0) > > { > > shingles[j].Append(TOKEN_SEPARATOR); > > } > > shingles[j].Append(termAtt.TermBuffer(), 0, > termAtt.TermLength()); > > } > > > > endOffsets[i] = offsetAtt.EndOffset(); > > i++; > > } > > > > return true; > > } > > catch (System.Exception e) > > { > > throw new System.SystemException(e.ToString()); > > } > > } > > > > > > /** @deprecated Will be removed in Lucene 3.0. This method is final, > as it should > > * not be overridden. Delegates to the backwards compatibility layer. > */ > > public sealed Token Next(Token reusableToken) > > { > > try > > { > > return base.Next(reusableToken); > > } > > catch (System.Exception e) > > { > > throw new System.SystemException(e.ToString()); > > } > > } > > > > /** @deprecated Will be removed in Lucene 3.0. This method is final, > as it should > > * not be overridden. Delegates to the backwards compatibility layer. 
> */ > > public Token Next() > > { > > try > > { > > return base.Next(); > > } > > catch (System.Exception e) > > { > > throw new System.SystemException(e.ToString()); > > } > > } > > > > > > public override void Reset() > > { > > try > > { > > base.Reset(); > > nextToken = null; > > shingleBufferPosition = 0; > > shingleBuf.Clear(); > > numFillerTokensToInsert = 0; > > currentToken = null; > > hasCurrentToken = false; > > } > > catch (System.Exception e) > > { > > throw new System.SystemException(e.ToString()); > > } > > > > } > > } > > } >