It might be https://issues.apache.org/jira/browse/LUCENE-8756
Kevin Risden On Tue, Apr 30, 2019 at 2:35 PM Gus Heck <[email protected]> wrote: > I'm seeing precommit failures on master that appear to be from this > commit. Also it's not clear from the commit message which issue this > belongs to... > > [forbidden-apis] Loading classes to check... > [forbidden-apis] Scanning classes for violations... > [forbidden-apis] Forbidden method invocation: > java.lang.String#format(java.lang.String,java.lang.Object[]) [Uses default > locale] > [forbidden-apis] in org.apache.lucene.queries.mlt.TestMoreLikeThis > (TestMoreLikeThis.java:497) > [forbidden-apis] Scanned 239 class file(s) for forbidden API invocations > (in 0.08s), 1 error(s). > > > On Tue, Apr 30, 2019 at 12:16 PM <[email protected]> wrote: > >> This is an automated email from the ASF dual-hosted git repository. >> >> mikemccand pushed a commit to branch master >> in repository https://gitbox.apache.org/repos/asf/lucene-solr.git >> >> commit 351e21f6203e8f3aece0cd5adf4049974bd2d636 >> Author: Olli Kuonanoja <[email protected]> >> AuthorDate: Mon Apr 8 16:44:30 2019 +0300 >> >> Fix MLT like text with custom frequencies >> >> When an analyzer with custom term frequencies is used with MLT like >> texts, the custom term frequencies are incorrectly omitted and a fixed >> frequency of 1 is used instead. >> >> This commit fixes the issue by using `TermFrequencyAttribute` to get >> the term frequencies instead of using fixed 1. Also adds test cases >> for the mentioned issue. 
>> --- >> .../apache/lucene/queries/mlt/MoreLikeThis.java | 12 +++- >> .../lucene/queries/mlt/TestMoreLikeThis.java | 70 >> ++++++++++++++++++++++ >> 2 files changed, 79 insertions(+), 3 deletions(-) >> >> diff --git >> a/lucene/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java >> b/lucene/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java >> index 61ebe93..7c077e5 100644 >> --- >> a/lucene/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java >> +++ >> b/lucene/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java >> @@ -28,6 +28,7 @@ import java.util.Set; >> import org.apache.lucene.analysis.Analyzer; >> import org.apache.lucene.analysis.TokenStream; >> import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; >> +import org.apache.lucene.analysis.tokenattributes.TermFrequencyAttribute; >> import org.apache.lucene.document.Document; >> import org.apache.lucene.index.FieldInfos; >> import org.apache.lucene.index.Fields; >> @@ -824,6 +825,7 @@ public final class MoreLikeThis { >> int tokenCount = 0; >> // for every token >> CharTermAttribute termAtt = >> ts.addAttribute(CharTermAttribute.class); >> + TermFrequencyAttribute tfAtt = >> ts.addAttribute(TermFrequencyAttribute.class); >> ts.reset(); >> while (ts.incrementToken()) { >> String word = termAtt.toString(); >> @@ -838,9 +840,9 @@ public final class MoreLikeThis { >> // increment frequency >> Int cnt = termFreqMap.get(word); >> if (cnt == null) { >> - termFreqMap.put(word, new Int()); >> + termFreqMap.put(word, new Int(tfAtt.getTermFrequency())); >> } else { >> - cnt.x++; >> + cnt.x += tfAtt.getTermFrequency(); >> } >> } >> ts.end(); >> @@ -982,7 +984,11 @@ public final class MoreLikeThis { >> int x; >> >> Int() { >> - x = 1; >> + this(1); >> + } >> + >> + Int(int initialValue) { >> + x = initialValue; >> } >> } >> } >> diff --git >> a/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java >> 
b/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java >> index 4a60015..aeec534 100644 >> --- >> a/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java >> +++ >> b/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java >> @@ -27,7 +27,12 @@ import java.util.Map; >> >> import org.apache.lucene.analysis.Analyzer; >> import org.apache.lucene.analysis.MockAnalyzer; >> +import org.apache.lucene.analysis.MockTokenFilter; >> import org.apache.lucene.analysis.MockTokenizer; >> +import org.apache.lucene.analysis.TokenFilter; >> +import org.apache.lucene.analysis.TokenStream; >> +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; >> +import org.apache.lucene.analysis.tokenattributes.TermFrequencyAttribute; >> import org.apache.lucene.document.Document; >> import org.apache.lucene.document.Field; >> import org.apache.lucene.index.IndexReader; >> @@ -41,6 +46,7 @@ import org.apache.lucene.search.Query; >> import org.apache.lucene.search.QueryUtils; >> import org.apache.lucene.search.TermQuery; >> import org.apache.lucene.store.Directory; >> +import org.apache.lucene.util.ArrayUtil; >> import org.apache.lucene.util.LuceneTestCase; >> >> import static org.hamcrest.core.Is.is; >> @@ -427,5 +433,69 @@ public class TestMoreLikeThis extends LuceneTestCase >> { >> analyzer.close(); >> } >> } >> + >> + public void testCustomFrequecy() throws IOException { >> + // define an analyzer with delimited term frequency, e.g. 
"foo|2 >> bar|3" >> + Analyzer analyzer = new Analyzer() { >> + >> + @Override >> + protected TokenStreamComponents createComponents(String fieldName) >> { >> + MockTokenizer tokenizer = new >> MockTokenizer(MockTokenizer.WHITESPACE, false, 100); >> + MockTokenFilter filt = new MockTokenFilter(tokenizer, >> MockTokenFilter.EMPTY_STOPSET); >> + return new TokenStreamComponents(tokenizer, >> addCustomTokenFilter(filt)); >> + } >> + >> + TokenStream addCustomTokenFilter(TokenStream input) { >> + return new TokenFilter(input) { >> + final CharTermAttribute termAtt = >> addAttribute(CharTermAttribute.class); >> + final TermFrequencyAttribute tfAtt = >> addAttribute(TermFrequencyAttribute.class); >> + >> + @Override >> + public boolean incrementToken() throws IOException { >> + if (input.incrementToken()) { >> + final char[] buffer = termAtt.buffer(); >> + final int length = termAtt.length(); >> + for (int i = 0; i < length; i++) { >> + if (buffer[i] == '|') { >> + termAtt.setLength(i); >> + i++; >> + tfAtt.setTermFrequency(ArrayUtil.parseInt(buffer, i, >> length - i)); >> + return true; >> + } >> + } >> + return true; >> + } >> + return false; >> + } >> + }; >> + } >> + }; >> + >> + mlt.setAnalyzer(analyzer); >> + mlt.setFieldNames(new String[] {"text"}); >> + mlt.setBoost(true); >> + >> + final double boost10 = ((BooleanQuery) mlt.like("text", new >> StringReader("lucene|10 release|1"))) >> + .clauses() >> + .stream() >> + .map(BooleanClause::getQuery) >> + .map(BoostQuery.class::cast) >> + .filter(x -> ((TermQuery) >> x.getQuery()).getTerm().text().equals("lucene")) >> + .mapToDouble(BoostQuery::getBoost) >> + .sum(); >> + >> + final double boost1 = ((BooleanQuery) mlt.like("text", new >> StringReader("lucene|1 release|1"))) >> + .clauses() >> + .stream() >> + .map(BooleanClause::getQuery) >> + .map(BoostQuery.class::cast) >> + .filter(x -> ((TermQuery) >> x.getQuery()).getTerm().text().equals("lucene")) >> + .mapToDouble(BoostQuery::getBoost) >> + .sum(); >> + >> + 
// mlt should use the custom frequencies provided by the analyzer so >> "lucene|10" should be boosted more than "lucene|1" >> + assertTrue(String.format("%s should be grater than %s", boost10, >> boost1), boost10 > boost1); >> + } >> + >> // TODO: add tests for the MoreLikeThisQuery >> } >> >> > > -- > http://www.needhamsoftware.com (work) > http://www.the111shift.com (play) >
