I'm sorry about this -- I saw this failure locally and thought I had fixed it before pushing, but obviously I had not. I blame git ;)
Thank you to Christine for stepping in and fixing it! Mike McCandless http://blog.mikemccandless.com On Tue, Apr 30, 2019 at 3:33 PM Gus Heck <gus.h...@gmail.com> wrote: > https://builds.apache.org/view/L/view/Lucene/job/PreCommit-SOLR-Build/392/ > > On Tue, Apr 30, 2019 at 2:37 PM Kevin Risden <kris...@apache.org> wrote: > >> It might be https://issues.apache.org/jira/browse/LUCENE-8756 >> >> Kevin Risden >> >> >> On Tue, Apr 30, 2019 at 2:35 PM Gus Heck <gus.h...@gmail.com> wrote: >> >>> I'm seeing precommit failures on master that appear to be from this >>> commit. Also it's not clear from the commit message which issue this >>> belongs to... >>> >>> [forbidden-apis] Loading classes to check... >>> [forbidden-apis] Scanning classes for violations... >>> [forbidden-apis] Forbidden method invocation: >>> java.lang.String#format(java.lang.String,java.lang.Object[]) [Uses default >>> locale] >>> [forbidden-apis] in org.apache.lucene.queries.mlt.TestMoreLikeThis >>> (TestMoreLikeThis.java:497) >>> [forbidden-apis] Scanned 239 class file(s) for forbidden API invocations >>> (in 0.08s), 1 error(s). >>> >>> >>> On Tue, Apr 30, 2019 at 12:16 PM <mikemcc...@apache.org> wrote: >>> >>>> This is an automated email from the ASF dual-hosted git repository. >>>> >>>> mikemccand pushed a commit to branch master >>>> in repository https://gitbox.apache.org/repos/asf/lucene-solr.git >>>> >>>> commit 351e21f6203e8f3aece0cd5adf4049974bd2d636 >>>> Author: Olli Kuonanoja <olli.kuonan...@nosto.com> >>>> AuthorDate: Mon Apr 8 16:44:30 2019 +0300 >>>> >>>> Fix MLT like text with custom frequencies >>>> >>>> When an analyzer with custom term frequencies is used with MLT like >>>> texts, the custom term frequencies are incorrectly omitted and a >>>> fixed >>>> frequency of 1 is used instead. >>>> >>>> This commit fixes the issue by using `TermFrequencyAttribute` to get >>>> the term frequencies instead of using fixed 1. Also adds test cases >>>> for them mentioned issue. 
>>>> --- >>>> .../apache/lucene/queries/mlt/MoreLikeThis.java | 12 +++- >>>> .../lucene/queries/mlt/TestMoreLikeThis.java | 70 >>>> ++++++++++++++++++++++ >>>> 2 files changed, 79 insertions(+), 3 deletions(-) >>>> >>>> diff --git >>>> a/lucene/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java >>>> b/lucene/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java >>>> index 61ebe93..7c077e5 100644 >>>> --- >>>> a/lucene/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java >>>> +++ >>>> b/lucene/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java >>>> @@ -28,6 +28,7 @@ import java.util.Set; >>>> import org.apache.lucene.analysis.Analyzer; >>>> import org.apache.lucene.analysis.TokenStream; >>>> import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; >>>> +import >>>> org.apache.lucene.analysis.tokenattributes.TermFrequencyAttribute; >>>> import org.apache.lucene.document.Document; >>>> import org.apache.lucene.index.FieldInfos; >>>> import org.apache.lucene.index.Fields; >>>> @@ -824,6 +825,7 @@ public final class MoreLikeThis { >>>> int tokenCount = 0; >>>> // for every token >>>> CharTermAttribute termAtt = >>>> ts.addAttribute(CharTermAttribute.class); >>>> + TermFrequencyAttribute tfAtt = >>>> ts.addAttribute(TermFrequencyAttribute.class); >>>> ts.reset(); >>>> while (ts.incrementToken()) { >>>> String word = termAtt.toString(); >>>> @@ -838,9 +840,9 @@ public final class MoreLikeThis { >>>> // increment frequency >>>> Int cnt = termFreqMap.get(word); >>>> if (cnt == null) { >>>> - termFreqMap.put(word, new Int()); >>>> + termFreqMap.put(word, new Int(tfAtt.getTermFrequency())); >>>> } else { >>>> - cnt.x++; >>>> + cnt.x += tfAtt.getTermFrequency(); >>>> } >>>> } >>>> ts.end(); >>>> @@ -982,7 +984,11 @@ public final class MoreLikeThis { >>>> int x; >>>> >>>> Int() { >>>> - x = 1; >>>> + this(1); >>>> + } >>>> + >>>> + Int(int initialValue) { >>>> + x = initialValue; >>>> } >>>> } >>>> } >>>> diff --git 
>>>> a/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java >>>> b/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java >>>> index 4a60015..aeec534 100644 >>>> --- >>>> a/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java >>>> +++ >>>> b/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java >>>> @@ -27,7 +27,12 @@ import java.util.Map; >>>> >>>> import org.apache.lucene.analysis.Analyzer; >>>> import org.apache.lucene.analysis.MockAnalyzer; >>>> +import org.apache.lucene.analysis.MockTokenFilter; >>>> import org.apache.lucene.analysis.MockTokenizer; >>>> +import org.apache.lucene.analysis.TokenFilter; >>>> +import org.apache.lucene.analysis.TokenStream; >>>> +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; >>>> +import >>>> org.apache.lucene.analysis.tokenattributes.TermFrequencyAttribute; >>>> import org.apache.lucene.document.Document; >>>> import org.apache.lucene.document.Field; >>>> import org.apache.lucene.index.IndexReader; >>>> @@ -41,6 +46,7 @@ import org.apache.lucene.search.Query; >>>> import org.apache.lucene.search.QueryUtils; >>>> import org.apache.lucene.search.TermQuery; >>>> import org.apache.lucene.store.Directory; >>>> +import org.apache.lucene.util.ArrayUtil; >>>> import org.apache.lucene.util.LuceneTestCase; >>>> >>>> import static org.hamcrest.core.Is.is; >>>> @@ -427,5 +433,69 @@ public class TestMoreLikeThis extends >>>> LuceneTestCase { >>>> analyzer.close(); >>>> } >>>> } >>>> + >>>> + public void testCustomFrequecy() throws IOException { >>>> + // define an analyzer with delimited term frequency, e.g. 
"foo|2 >>>> bar|3" >>>> + Analyzer analyzer = new Analyzer() { >>>> + >>>> + @Override >>>> + protected TokenStreamComponents createComponents(String >>>> fieldName) { >>>> + MockTokenizer tokenizer = new >>>> MockTokenizer(MockTokenizer.WHITESPACE, false, 100); >>>> + MockTokenFilter filt = new MockTokenFilter(tokenizer, >>>> MockTokenFilter.EMPTY_STOPSET); >>>> + return new TokenStreamComponents(tokenizer, >>>> addCustomTokenFilter(filt)); >>>> + } >>>> + >>>> + TokenStream addCustomTokenFilter(TokenStream input) { >>>> + return new TokenFilter(input) { >>>> + final CharTermAttribute termAtt = >>>> addAttribute(CharTermAttribute.class); >>>> + final TermFrequencyAttribute tfAtt = >>>> addAttribute(TermFrequencyAttribute.class); >>>> + >>>> + @Override >>>> + public boolean incrementToken() throws IOException { >>>> + if (input.incrementToken()) { >>>> + final char[] buffer = termAtt.buffer(); >>>> + final int length = termAtt.length(); >>>> + for (int i = 0; i < length; i++) { >>>> + if (buffer[i] == '|') { >>>> + termAtt.setLength(i); >>>> + i++; >>>> + tfAtt.setTermFrequency(ArrayUtil.parseInt(buffer, i, >>>> length - i)); >>>> + return true; >>>> + } >>>> + } >>>> + return true; >>>> + } >>>> + return false; >>>> + } >>>> + }; >>>> + } >>>> + }; >>>> + >>>> + mlt.setAnalyzer(analyzer); >>>> + mlt.setFieldNames(new String[] {"text"}); >>>> + mlt.setBoost(true); >>>> + >>>> + final double boost10 = ((BooleanQuery) mlt.like("text", new >>>> StringReader("lucene|10 release|1"))) >>>> + .clauses() >>>> + .stream() >>>> + .map(BooleanClause::getQuery) >>>> + .map(BoostQuery.class::cast) >>>> + .filter(x -> ((TermQuery) >>>> x.getQuery()).getTerm().text().equals("lucene")) >>>> + .mapToDouble(BoostQuery::getBoost) >>>> + .sum(); >>>> + >>>> + final double boost1 = ((BooleanQuery) mlt.like("text", new >>>> StringReader("lucene|1 release|1"))) >>>> + .clauses() >>>> + .stream() >>>> + .map(BooleanClause::getQuery) >>>> + .map(BoostQuery.class::cast) >>>> + .filter(x -> 
((TermQuery) >>>> x.getQuery()).getTerm().text().equals("lucene")) >>>> + .mapToDouble(BoostQuery::getBoost) >>>> + .sum(); >>>> + >>>> + // mlt should use the custom frequencies provided by the analyzer >>>> so "lucene|10" should be boosted more than "lucene|1" >>>> + assertTrue(String.format("%s should be grater than %s", boost10, >>>> boost1), boost10 > boost1); >>>> + } >>>> + >>>> // TODO: add tests for the MoreLikeThisQuery >>>> } >>>> >>>> >>> >>> -- >>> http://www.needhamsoftware.com (work) >>> http://www.the111shift.com (play) >>> >> > > -- > http://www.needhamsoftware.com (work) > http://www.the111shift.com (play) >