Re: [lucene-solr] 01/02: Fix MLT like text with custom frequencies

Gus Heck Tue, 30 Apr 2019 12:34:13 -0700

https://builds.apache.org/view/L/view/Lucene/job/PreCommit-SOLR-Build/392/


On Tue, Apr 30, 2019 at 2:37 PM Kevin Risden <[email protected]> wrote:

> It might be https://issues.apache.org/jira/browse/LUCENE-8756
>
> Kevin Risden
>
>
> On Tue, Apr 30, 2019 at 2:35 PM Gus Heck <[email protected]> wrote:
>
>> I'm seeing precommit failures on master that appear to be from this
>> commit. Also it's not clear from the commit message which issue this
>> belongs to...
>>
>> [forbidden-apis] Loading classes to check...
>> [forbidden-apis] Scanning classes for violations...
>> [forbidden-apis] Forbidden method invocation:
>> java.lang.String#format(java.lang.String,java.lang.Object[]) [Uses default
>> locale]
>> [forbidden-apis]   in org.apache.lucene.queries.mlt.TestMoreLikeThis
>> (TestMoreLikeThis.java:497)
>> [forbidden-apis] Scanned 239 class file(s) for forbidden API invocations
>> (in 0.08s), 1 error(s).
>>
>>
>> On Tue, Apr 30, 2019 at 12:16 PM <[email protected]> wrote:
>>
>>> This is an automated email from the ASF dual-hosted git repository.
>>>
>>> mikemccand pushed a commit to branch master
>>> in repository https://gitbox.apache.org/repos/asf/lucene-solr.git
>>>
>>> commit 351e21f6203e8f3aece0cd5adf4049974bd2d636
>>> Author: Olli Kuonanoja <[email protected]>
>>> AuthorDate: Mon Apr 8 16:44:30 2019 +0300
>>>
>>>     Fix MLT like text with custom frequencies
>>>
>>>     When an analyzer with custom term frequencies is used with MLT like
>>>     texts, the custom term frequencies are incorrectly omitted and a fixed
>>>     frequency of 1 is used instead.
>>>
>>>     This commit fixes the issue by using `TermFrequencyAttribute` to get
>>>     the term frequencies instead of using fixed 1. Also adds test cases
>>>     for the mentioned issue.
>>> ---
>>>  .../apache/lucene/queries/mlt/MoreLikeThis.java    | 12 +++-
>>>  .../lucene/queries/mlt/TestMoreLikeThis.java       | 70
>>> ++++++++++++++++++++++
>>>  2 files changed, 79 insertions(+), 3 deletions(-)
>>>
>>> diff --git
>>> a/lucene/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java
>>> b/lucene/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java
>>> index 61ebe93..7c077e5 100644
>>> ---
>>> a/lucene/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java
>>> +++
>>> b/lucene/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java
>>> @@ -28,6 +28,7 @@ import java.util.Set;
>>>  import org.apache.lucene.analysis.Analyzer;
>>>  import org.apache.lucene.analysis.TokenStream;
>>>  import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
>>> +import
>>> org.apache.lucene.analysis.tokenattributes.TermFrequencyAttribute;
>>>  import org.apache.lucene.document.Document;
>>>  import org.apache.lucene.index.FieldInfos;
>>>  import org.apache.lucene.index.Fields;
>>> @@ -824,6 +825,7 @@ public final class MoreLikeThis {
>>>        int tokenCount = 0;
>>>        // for every token
>>>        CharTermAttribute termAtt =
>>> ts.addAttribute(CharTermAttribute.class);
>>> +      TermFrequencyAttribute tfAtt =
>>> ts.addAttribute(TermFrequencyAttribute.class);
>>>        ts.reset();
>>>        while (ts.incrementToken()) {
>>>          String word = termAtt.toString();
>>> @@ -838,9 +840,9 @@ public final class MoreLikeThis {
>>>          // increment frequency
>>>          Int cnt = termFreqMap.get(word);
>>>          if (cnt == null) {
>>> -          termFreqMap.put(word, new Int());
>>> +          termFreqMap.put(word, new Int(tfAtt.getTermFrequency()));
>>>          } else {
>>> -          cnt.x++;
>>> +          cnt.x += tfAtt.getTermFrequency();
>>>          }
>>>        }
>>>        ts.end();
>>> @@ -982,7 +984,11 @@ public final class MoreLikeThis {
>>>      int x;
>>>
>>>      Int() {
>>> -      x = 1;
>>> +      this(1);
>>> +    }
>>> +
>>> +    Int(int initialValue) {
>>> +      x = initialValue;
>>>      }
>>>    }
>>>  }
>>> diff --git
>>> a/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java
>>> b/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java
>>> index 4a60015..aeec534 100644
>>> ---
>>> a/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java
>>> +++
>>> b/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java
>>> @@ -27,7 +27,12 @@ import java.util.Map;
>>>
>>>  import org.apache.lucene.analysis.Analyzer;
>>>  import org.apache.lucene.analysis.MockAnalyzer;
>>> +import org.apache.lucene.analysis.MockTokenFilter;
>>>  import org.apache.lucene.analysis.MockTokenizer;
>>> +import org.apache.lucene.analysis.TokenFilter;
>>> +import org.apache.lucene.analysis.TokenStream;
>>> +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
>>> +import
>>> org.apache.lucene.analysis.tokenattributes.TermFrequencyAttribute;
>>>  import org.apache.lucene.document.Document;
>>>  import org.apache.lucene.document.Field;
>>>  import org.apache.lucene.index.IndexReader;
>>> @@ -41,6 +46,7 @@ import org.apache.lucene.search.Query;
>>>  import org.apache.lucene.search.QueryUtils;
>>>  import org.apache.lucene.search.TermQuery;
>>>  import org.apache.lucene.store.Directory;
>>> +import org.apache.lucene.util.ArrayUtil;
>>>  import org.apache.lucene.util.LuceneTestCase;
>>>
>>>  import static org.hamcrest.core.Is.is;
>>> @@ -427,5 +433,69 @@ public class TestMoreLikeThis extends
>>> LuceneTestCase {
>>>        analyzer.close();
>>>      }
>>>    }
>>> +
>>> +  public void testCustomFrequecy() throws IOException {
>>> +    // define an analyzer with delimited term frequency, e.g. "foo|2
>>> bar|3"
>>> +    Analyzer analyzer = new Analyzer() {
>>> +
>>> +      @Override
>>> +      protected TokenStreamComponents createComponents(String
>>> fieldName) {
>>> +        MockTokenizer tokenizer = new
>>> MockTokenizer(MockTokenizer.WHITESPACE, false, 100);
>>> +        MockTokenFilter filt = new MockTokenFilter(tokenizer,
>>> MockTokenFilter.EMPTY_STOPSET);
>>> +        return new TokenStreamComponents(tokenizer,
>>> addCustomTokenFilter(filt));
>>> +      }
>>> +
>>> +      TokenStream addCustomTokenFilter(TokenStream input) {
>>> +        return new TokenFilter(input) {
>>> +          final CharTermAttribute termAtt =
>>> addAttribute(CharTermAttribute.class);
>>> +          final TermFrequencyAttribute tfAtt =
>>> addAttribute(TermFrequencyAttribute.class);
>>> +
>>> +          @Override
>>> +          public boolean incrementToken() throws IOException {
>>> +            if (input.incrementToken()) {
>>> +              final char[] buffer = termAtt.buffer();
>>> +              final int length = termAtt.length();
>>> +              for (int i = 0; i < length; i++) {
>>> +                if (buffer[i] == '|') {
>>> +                  termAtt.setLength(i);
>>> +                  i++;
>>> +                  tfAtt.setTermFrequency(ArrayUtil.parseInt(buffer, i,
>>> length - i));
>>> +                  return true;
>>> +                }
>>> +              }
>>> +              return true;
>>> +            }
>>> +            return false;
>>> +          }
>>> +        };
>>> +      }
>>> +    };
>>> +
>>> +    mlt.setAnalyzer(analyzer);
>>> +    mlt.setFieldNames(new String[] {"text"});
>>> +    mlt.setBoost(true);
>>> +
>>> +    final double boost10 = ((BooleanQuery) mlt.like("text", new
>>> StringReader("lucene|10 release|1")))
>>> +        .clauses()
>>> +        .stream()
>>> +        .map(BooleanClause::getQuery)
>>> +        .map(BoostQuery.class::cast)
>>> +        .filter(x -> ((TermQuery)
>>> x.getQuery()).getTerm().text().equals("lucene"))
>>> +        .mapToDouble(BoostQuery::getBoost)
>>> +        .sum();
>>> +
>>> +    final double boost1 = ((BooleanQuery) mlt.like("text", new
>>> StringReader("lucene|1 release|1")))
>>> +        .clauses()
>>> +        .stream()
>>> +        .map(BooleanClause::getQuery)
>>> +        .map(BoostQuery.class::cast)
>>> +        .filter(x -> ((TermQuery)
>>> x.getQuery()).getTerm().text().equals("lucene"))
>>> +        .mapToDouble(BoostQuery::getBoost)
>>> +        .sum();
>>> +
>>> +    // mlt should use the custom frequencies provided by the analyzer
>>> so "lucene|10" should be boosted more than "lucene|1"
>>> +    assertTrue(String.format("%s should be grater than %s", boost10,
>>> boost1), boost10 > boost1);
>>> +  }
>>> +
>>>    // TODO: add tests for the MoreLikeThisQuery
>>>  }
>>>
>>>
>>
>> --
>> http://www.needhamsoftware.com (work)
>> http://www.the111shift.com (play)
>>
>

-- 
http://www.needhamsoftware.com (work)
http://www.the111shift.com (play)

Re: [lucene-solr] 01/02: Fix MLT like text with custom frequencies

Reply via email to