https://builds.apache.org/view/L/view/Lucene/job/PreCommit-SOLR-Build/392/
On Tue, Apr 30, 2019 at 2:37 PM Kevin Risden <[email protected]> wrote: > It might be https://issues.apache.org/jira/browse/LUCENE-8756 > > Kevin Risden > > > On Tue, Apr 30, 2019 at 2:35 PM Gus Heck <[email protected]> wrote: > >> I'm seeing precommit failures on master that appear to be from this >> commit. Also it's not clear from the commit message which issue this >> belongs to... >> >> [forbidden-apis] Loading classes to check... >> [forbidden-apis] Scanning classes for violations... >> [forbidden-apis] Forbidden method invocation: >> java.lang.String#format(java.lang.String,java.lang.Object[]) [Uses default >> locale] >> [forbidden-apis] in org.apache.lucene.queries.mlt.TestMoreLikeThis >> (TestMoreLikeThis.java:497) >> [forbidden-apis] Scanned 239 class file(s) for forbidden API invocations >> (in 0.08s), 1 error(s). >> >> >> On Tue, Apr 30, 2019 at 12:16 PM <[email protected]> wrote: >> >>> This is an automated email from the ASF dual-hosted git repository. >>> >>> mikemccand pushed a commit to branch master >>> in repository https://gitbox.apache.org/repos/asf/lucene-solr.git >>> >>> commit 351e21f6203e8f3aece0cd5adf4049974bd2d636 >>> Author: Olli Kuonanoja <[email protected]> >>> AuthorDate: Mon Apr 8 16:44:30 2019 +0300 >>> >>> Fix MLT like text with custom frequencies >>> >>> When an analyzer with custom term frequencies is used with MLT like >>> texts, the custom term frequencies are incorrectly omitted and a >>> fixed >>> frequency of 1 is used instead. >>> >>> This commit fixes the issue by using `TermFrequencyAttribute` to get >>> the term frequencies instead of using fixed 1. Also adds test cases >>> for them mentioned issue. >>> --- >>> .../apache/lucene/queries/mlt/MoreLikeThis.java | 12 +++- >>> .../lucene/queries/mlt/TestMoreLikeThis.java | 70 >>> ++++++++++++++++++++++ >>> 2 files changed, 79 insertions(+), 3 deletions(-) >>> >>> diff --git >>> a/lucene/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java >>> b/lucene/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java >>> index 61ebe93..7c077e5 100644 >>> --- >>> a/lucene/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java >>> +++ >>> b/lucene/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java >>> @@ -28,6 +28,7 @@ import java.util.Set; >>> import org.apache.lucene.analysis.Analyzer; >>> import org.apache.lucene.analysis.TokenStream; >>> import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; >>> +import >>> org.apache.lucene.analysis.tokenattributes.TermFrequencyAttribute; >>> import org.apache.lucene.document.Document; >>> import org.apache.lucene.index.FieldInfos; >>> import org.apache.lucene.index.Fields; >>> @@ -824,6 +825,7 @@ public final class MoreLikeThis { >>> int tokenCount = 0; >>> // for every token >>> CharTermAttribute termAtt = >>> ts.addAttribute(CharTermAttribute.class); >>> + TermFrequencyAttribute tfAtt = >>> ts.addAttribute(TermFrequencyAttribute.class); >>> ts.reset(); >>> while (ts.incrementToken()) { >>> String word = termAtt.toString(); >>> @@ -838,9 +840,9 @@ public final class MoreLikeThis { >>> // increment frequency >>> Int cnt = termFreqMap.get(word); >>> if (cnt == null) { >>> - termFreqMap.put(word, new Int()); >>> + termFreqMap.put(word, new Int(tfAtt.getTermFrequency())); >>> } else { >>> - cnt.x++; >>> + cnt.x += tfAtt.getTermFrequency(); >>> } >>> } >>> ts.end(); >>> @@ -982,7 +984,11 @@ public final class MoreLikeThis { >>> int x; >>> >>> Int() { >>> - x = 1; >>> + this(1); >>> + } >>> + >>> + Int(int initialValue) { >>> + x = initialValue; >>> } >>> } >>> } >>> diff --git >>> a/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java >>> b/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java >>> index 4a60015..aeec534 100644 >>> --- >>> a/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java >>> +++ >>> b/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java >>> @@ -27,7 +27,12 @@ import java.util.Map; >>> >>> import org.apache.lucene.analysis.Analyzer; >>> import org.apache.lucene.analysis.MockAnalyzer; >>> +import org.apache.lucene.analysis.MockTokenFilter; >>> import org.apache.lucene.analysis.MockTokenizer; >>> +import org.apache.lucene.analysis.TokenFilter; >>> +import org.apache.lucene.analysis.TokenStream; >>> +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; >>> +import >>> org.apache.lucene.analysis.tokenattributes.TermFrequencyAttribute; >>> import org.apache.lucene.document.Document; >>> import org.apache.lucene.document.Field; >>> import org.apache.lucene.index.IndexReader; >>> @@ -41,6 +46,7 @@ import org.apache.lucene.search.Query; >>> import org.apache.lucene.search.QueryUtils; >>> import org.apache.lucene.search.TermQuery; >>> import org.apache.lucene.store.Directory; >>> +import org.apache.lucene.util.ArrayUtil; >>> import org.apache.lucene.util.LuceneTestCase; >>> >>> import static org.hamcrest.core.Is.is; >>> @@ -427,5 +433,69 @@ public class TestMoreLikeThis extends >>> LuceneTestCase { >>> analyzer.close(); >>> } >>> } >>> + >>> + public void testCustomFrequecy() throws IOException { >>> + // define an analyzer with delimited term frequency, e.g. "foo|2 >>> bar|3" >>> + Analyzer analyzer = new Analyzer() { >>> + >>> + @Override >>> + protected TokenStreamComponents createComponents(String >>> fieldName) { >>> + MockTokenizer tokenizer = new >>> MockTokenizer(MockTokenizer.WHITESPACE, false, 100); >>> + MockTokenFilter filt = new MockTokenFilter(tokenizer, >>> MockTokenFilter.EMPTY_STOPSET); >>> + return new TokenStreamComponents(tokenizer, >>> addCustomTokenFilter(filt)); >>> + } >>> + >>> + TokenStream addCustomTokenFilter(TokenStream input) { >>> + return new TokenFilter(input) { >>> + final CharTermAttribute termAtt = >>> addAttribute(CharTermAttribute.class); >>> + final TermFrequencyAttribute tfAtt = >>> addAttribute(TermFrequencyAttribute.class); >>> + >>> + @Override >>> + public boolean incrementToken() throws IOException { >>> + if (input.incrementToken()) { >>> + final char[] buffer = termAtt.buffer(); >>> + final int length = termAtt.length(); >>> + for (int i = 0; i < length; i++) { >>> + if (buffer[i] == '|') { >>> + termAtt.setLength(i); >>> + i++; >>> + tfAtt.setTermFrequency(ArrayUtil.parseInt(buffer, i, >>> length - i)); >>> + return true; >>> + } >>> + } >>> + return true; >>> + } >>> + return false; >>> + } >>> + }; >>> + } >>> + }; >>> + >>> + mlt.setAnalyzer(analyzer); >>> + mlt.setFieldNames(new String[] {"text"}); >>> + mlt.setBoost(true); >>> + >>> + final double boost10 = ((BooleanQuery) mlt.like("text", new >>> StringReader("lucene|10 release|1"))) >>> + .clauses() >>> + .stream() >>> + .map(BooleanClause::getQuery) >>> + .map(BoostQuery.class::cast) >>> + .filter(x -> ((TermQuery) >>> x.getQuery()).getTerm().text().equals("lucene")) >>> + .mapToDouble(BoostQuery::getBoost) >>> + .sum(); >>> + >>> + final double boost1 = ((BooleanQuery) mlt.like("text", new >>> StringReader("lucene|1 release|1"))) >>> + .clauses() >>> + .stream() >>> + .map(BooleanClause::getQuery) >>> + .map(BoostQuery.class::cast) >>> + .filter(x -> ((TermQuery) >>> x.getQuery()).getTerm().text().equals("lucene")) >>> + .mapToDouble(BoostQuery::getBoost) >>> + .sum(); >>> + >>> + // mlt should use the custom frequencies provided by the analyzer >>> so "lucene|10" should be boosted more than "lucene|1" >>> + assertTrue(String.format("%s should be grater than %s", boost10, >>> boost1), boost10 > boost1); >>> + } >>> + >>> // TODO: add tests for the MoreLikeThisQuery >>> } >>> >>> >> >> -- >> http://www.needhamsoftware.com (work) >> http://www.the111shift.com (play) >> > -- http://www.needhamsoftware.com (work) http://www.the111shift.com (play)
