Oh, I see, it’s because TokenFilter extends AttributeSource, so the nested State class is inherited and already in scope by its simple name, which makes the import unnecessary. Will push a fix as part of LUCENE-7623.
Alan Woodward www.flax.co.uk > On 16 Jan 2017, at 11:26, Alan Woodward <[email protected]> wrote: > > This is making precommit fail for me locally: > > -ecj-javadoc-lint-src: > [mkdir] Created dir: > /var/folders/16/hgq2wtys7nv1_x9st6mdpwzh0000gp/T/ecj662445789 > [ecj-lint] Compiling 453 source files to > /var/folders/16/hgq2wtys7nv1_x9st6mdpwzh0000gp/T/ecj662445789 > [ecj-lint] ---------- > [ecj-lint] 1. ERROR in > /Users/woody/asf/lucene-solr-trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java > (at line 26) > [ecj-lint] import org.apache.lucene.util.AttributeSource.State; > [ecj-lint] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ > [ecj-lint] The import org.apache.lucene.util.AttributeSource.State is never > used > [ecj-lint] ————— > > Which is confusing as hell, because the import clearly *is* used. And > removing the import fixes things, even though it shouldn’t then compile. > > Alan Woodward > www.flax.co.uk <http://www.flax.co.uk/> > > >> On 16 Jan 2017, at 10:27, [email protected] >> <mailto:[email protected]> wrote: >> >> Repository: lucene-solr >> Updated Branches: >> refs/heads/branch_6x b5b17b23c -> a69c632aa >> >> >> LUCENE-7630: Fix (Edge)NGramTokenFilter to no longer drop payloads and >> preserve all attributes >> [merge branch 'edgepayloads' from Nathan Gass >> https://github.com/xabbu42/lucene-solr] >> <https://github.com/xabbu42/lucene-solr]> >> >> Signed-off-by: Uwe Schindler <[email protected] >> <mailto:[email protected]>> >> >> >> Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo >> <http://git-wip-us.apache.org/repos/asf/lucene-solr/repo> >> Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/a69c632a >> <http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/a69c632a> >> Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/a69c632a >> <http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/a69c632a> >> Diff: 
http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/a69c632a >> <http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/a69c632a> >> >> Branch: refs/heads/branch_6x >> Commit: a69c632aa54d064515152145bcbcbe1e869d7061 >> Parents: b5b17b2 >> Author: Uwe Schindler <[email protected] <mailto:[email protected]>> >> Authored: Mon Jan 16 11:16:43 2017 +0100 >> Committer: Uwe Schindler <[email protected] >> <mailto:[email protected]>> >> Committed: Mon Jan 16 11:24:55 2017 +0100 >> >> ---------------------------------------------------------------------- >> lucene/CHANGES.txt | 7 +++ >> .../analysis/ngram/EdgeNGramTokenFilter.java | 17 ++----- >> .../lucene/analysis/ngram/NGramTokenFilter.java | 19 +++----- >> .../lucene/analysis/ngram/TestNGramFilters.java | 47 ++++++++++++++++++++ >> 4 files changed, 63 insertions(+), 27 deletions(-) >> ---------------------------------------------------------------------- >> >> >> http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/a69c632a/lucene/CHANGES.txt >> >> <http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/a69c632a/lucene/CHANGES.txt> >> ---------------------------------------------------------------------- >> diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt >> index 5de3bab..af0ff77 100644 >> --- a/lucene/CHANGES.txt >> +++ b/lucene/CHANGES.txt >> @@ -6,6 +6,13 @@ http://s.apache.org/luceneversions >> ======================= Lucene 6.5.0 ======================= >> (No Changes) >> >> +======================= Lucene 6.5.0 ======================= >> + >> +Bug Fixes >> + >> +* LUCENE-7630: Fix (Edge)NGramTokenFilter to no longer drop payloads >> + and preserve all attributes. 
(Nathan Gass via Uwe Schindler) >> + >> ======================= Lucene 6.4.0 ======================= >> >> API Changes >> >> http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/a69c632a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java >> ---------------------------------------------------------------------- >> diff --git >> a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java >> >> b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java >> index 827e26f..47b80ff 100644 >> --- >> a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java >> +++ >> b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java >> @@ -22,9 +22,8 @@ import java.io.IOException; >> import org.apache.lucene.analysis.TokenFilter; >> import org.apache.lucene.analysis.TokenStream; >> import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; >> -import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; >> import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; >> -import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; >> +import org.apache.lucene.util.AttributeSource.State; >> >> /** >> * Tokenizes the given token into n-grams of given size(s). 
>> @@ -43,15 +42,11 @@ public final class EdgeNGramTokenFilter extends >> TokenFilter { >> private int curTermLength; >> private int curCodePointCount; >> private int curGramSize; >> - private int tokStart; >> - private int tokEnd; // only used if the length changed before this filter >> private int savePosIncr; >> - private int savePosLen; >> + private State state; >> >> private final CharTermAttribute termAtt = >> addAttribute(CharTermAttribute.class); >> - private final OffsetAttribute offsetAtt = >> addAttribute(OffsetAttribute.class); >> private final PositionIncrementAttribute posIncrAtt = >> addAttribute(PositionIncrementAttribute.class); >> - private final PositionLengthAttribute posLenAtt = >> addAttribute(PositionLengthAttribute.class); >> >> /** >> * Creates EdgeNGramTokenFilter that can generate n-grams in the sizes of >> the given range >> @@ -86,17 +81,14 @@ public final class EdgeNGramTokenFilter extends >> TokenFilter { >> curTermLength = termAtt.length(); >> curCodePointCount = Character.codePointCount(termAtt, 0, >> termAtt.length()); >> curGramSize = minGram; >> - tokStart = offsetAtt.startOffset(); >> - tokEnd = offsetAtt.endOffset(); >> + state = captureState(); >> savePosIncr += posIncrAtt.getPositionIncrement(); >> - savePosLen = posLenAtt.getPositionLength(); >> } >> } >> if (curGramSize <= maxGram) { // if we have hit the end of our >> n-gram size range, quit >> if (curGramSize <= curCodePointCount) { // if the remaining input is >> too short, we can't generate any n-grams >> // grab gramSize chars from front or back >> - clearAttributes(); >> - offsetAtt.setOffset(tokStart, tokEnd); >> + restoreState(state); >> // first ngram gets increment, others don't >> if (curGramSize == minGram) { >> posIncrAtt.setPositionIncrement(savePosIncr); >> @@ -104,7 +96,6 @@ public final class EdgeNGramTokenFilter extends >> TokenFilter { >> } else { >> posIncrAtt.setPositionIncrement(0); >> } >> - posLenAtt.setPositionLength(savePosLen); >> final int 
charLength = Character.offsetByCodePoints(curTermBuffer, >> 0, curTermLength, 0, curGramSize); >> termAtt.copyBuffer(curTermBuffer, 0, charLength); >> curGramSize++; >> >> http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/a69c632a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java >> ---------------------------------------------------------------------- >> diff --git >> a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java >> >> b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java >> index e275cfa..cb5d447 100644 >> --- >> a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java >> +++ >> b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java >> @@ -23,9 +23,8 @@ import org.apache.lucene.analysis.TokenFilter; >> import org.apache.lucene.analysis.TokenStream; >> import org.apache.lucene.analysis.miscellaneous.CodepointCountFilter; >> import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; >> -import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; >> import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; >> -import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; >> +import org.apache.lucene.util.AttributeSource.State; >> >> /** >> * Tokenizes the input into n-grams of the given size(s). 
>> @@ -52,14 +51,11 @@ public final class NGramTokenFilter extends TokenFilter { >> private int curCodePointCount; >> private int curGramSize; >> private int curPos; >> - private int curPosInc, curPosLen; >> - private int tokStart; >> - private int tokEnd; >> + private int curPosInc; >> + private State state; >> >> private final CharTermAttribute termAtt = >> addAttribute(CharTermAttribute.class); >> private final PositionIncrementAttribute posIncAtt; >> - private final PositionLengthAttribute posLenAtt; >> - private final OffsetAttribute offsetAtt = >> addAttribute(OffsetAttribute.class); >> >> /** >> * Creates NGramTokenFilter with given min and max n-grams. >> @@ -79,7 +75,6 @@ public final class NGramTokenFilter extends TokenFilter { >> this.maxGram = maxGram; >> >> posIncAtt = addAttribute(PositionIncrementAttribute.class); >> - posLenAtt = addAttribute(PositionLengthAttribute.class); >> } >> >> /** >> @@ -104,9 +99,7 @@ public final class NGramTokenFilter extends TokenFilter { >> curGramSize = minGram; >> curPos = 0; >> curPosInc = posIncAtt.getPositionIncrement(); >> - curPosLen = posLenAtt.getPositionLength(); >> - tokStart = offsetAtt.startOffset(); >> - tokEnd = offsetAtt.endOffset(); >> + state = captureState(); >> } >> } >> >> @@ -115,14 +108,12 @@ public final class NGramTokenFilter extends >> TokenFilter { >> curGramSize = minGram; >> } >> if ((curPos + curGramSize) <= curCodePointCount) { >> - clearAttributes(); >> + restoreState(state); >> final int start = Character.offsetByCodePoints(curTermBuffer, 0, >> curTermLength, 0, curPos); >> final int end = Character.offsetByCodePoints(curTermBuffer, 0, >> curTermLength, start, curGramSize); >> termAtt.copyBuffer(curTermBuffer, start, end - start); >> posIncAtt.setPositionIncrement(curPosInc); >> curPosInc = 0; >> - posLenAtt.setPositionLength(curPosLen); >> - offsetAtt.setOffset(tokStart, tokEnd); >> curGramSize++; >> return true; >> } >> >> 
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/a69c632a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/TestNGramFilters.java >> ---------------------------------------------------------------------- >> diff --git >> a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/TestNGramFilters.java >> >> b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/TestNGramFilters.java >> index 1243352..5de532f 100644 >> --- >> a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/TestNGramFilters.java >> +++ >> b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/TestNGramFilters.java >> @@ -22,7 +22,10 @@ import java.io.StringReader; >> >> import org.apache.lucene.analysis.TokenStream; >> import org.apache.lucene.analysis.Tokenizer; >> +import org.apache.lucene.analysis.payloads.PayloadHelper; >> +import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; >> import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase; >> +import org.apache.lucene.util.BytesRef; >> >> /** >> * Simple tests to ensure the NGram filter factories are working. 
>> @@ -77,6 +80,28 @@ public class TestNGramFilters extends >> BaseTokenStreamFactoryTestCase { >> } >> >> /** >> + * Test NGramFilterFactory on tokens with payloads >> + */ >> + public void testNGramFilterPayload() throws Exception { >> + Reader reader = new StringReader("test|0.1"); >> + TokenStream stream = whitespaceMockTokenizer(reader); >> + stream = tokenFilterFactory("DelimitedPayload", "encoder", >> "float").create(stream); >> + stream = tokenFilterFactory("NGram", "minGramSize", "1", "maxGramSize", >> "2").create(stream); >> + >> + stream.reset(); >> + while (stream.incrementToken()) { >> + PayloadAttribute payAttr = >> stream.getAttribute(PayloadAttribute.class); >> + assertNotNull(payAttr); >> + BytesRef payData = payAttr.getPayload(); >> + assertNotNull(payData); >> + float payFloat = PayloadHelper.decodeFloat(payData.bytes); >> + assertEquals(0.1f, payFloat, 0.0f); >> + } >> + stream.end(); >> + stream.close(); >> + } >> + >> + /** >> * Test EdgeNGramTokenizerFactory >> */ >> public void testEdgeNGramTokenizer() throws Exception { >> @@ -123,6 +148,28 @@ public class TestNGramFilters extends >> BaseTokenStreamFactoryTestCase { >> assertTokenStreamContents(stream, >> new String[] { "t", "te" }); >> } >> + >> + /** >> + * Test EdgeNGramFilterFactory on tokens with payloads >> + */ >> + public void testEdgeNGramFilterPayload() throws Exception { >> + Reader reader = new StringReader("test|0.1"); >> + TokenStream stream = whitespaceMockTokenizer(reader); >> + stream = tokenFilterFactory("DelimitedPayload", "encoder", >> "float").create(stream); >> + stream = tokenFilterFactory("EdgeNGram", "minGramSize", "1", >> "maxGramSize", "2").create(stream); >> + >> + stream.reset(); >> + while (stream.incrementToken()) { >> + PayloadAttribute payAttr = >> stream.getAttribute(PayloadAttribute.class); >> + assertNotNull(payAttr); >> + BytesRef payData = payAttr.getPayload(); >> + assertNotNull(payData); >> + float payFloat = PayloadHelper.decodeFloat(payData.bytes); 
>> + assertEquals(0.1f, payFloat, 0.0f); >> + } >> + stream.end(); >> + stream.close(); >> + } >> >> /** Test that bogus arguments result in exception */ >> public void testBogusArguments() throws Exception { >> >
