Re: [lucene-solr] 01/02: Fix MLT like text with custom frequencies

Kevin Risden Tue, 30 Apr 2019 11:37:45 -0700

It might be https://issues.apache.org/jira/browse/LUCENE-8756


Kevin Risden


On Tue, Apr 30, 2019 at 2:35 PM Gus Heck <[email protected]> wrote:

> I'm seeing precommit failures on master that appear to be from this
> commit. Also it's not clear from the commit message which issue this
> belongs to...
>
> [forbidden-apis] Loading classes to check...
> [forbidden-apis] Scanning classes for violations...
> [forbidden-apis] Forbidden method invocation:
> java.lang.String#format(java.lang.String,java.lang.Object[]) [Uses default
> locale]
> [forbidden-apis]   in org.apache.lucene.queries.mlt.TestMoreLikeThis
> (TestMoreLikeThis.java:497)
> [forbidden-apis] Scanned 239 class file(s) for forbidden API invocations
> (in 0.08s), 1 error(s).
>
>
> On Tue, Apr 30, 2019 at 12:16 PM <[email protected]> wrote:
>
>> This is an automated email from the ASF dual-hosted git repository.
>>
>> mikemccand pushed a commit to branch master
>> in repository https://gitbox.apache.org/repos/asf/lucene-solr.git
>>
>> commit 351e21f6203e8f3aece0cd5adf4049974bd2d636
>> Author: Olli Kuonanoja <[email protected]>
>> AuthorDate: Mon Apr 8 16:44:30 2019 +0300
>>
>>     Fix MLT like text with custom frequencies
>>
>>     When an analyzer with custom term frequencies is used with MLT like
>>     texts, the custom term frequencies are incorrectly omitted and a fixed
>>     frequency of 1 is used instead.
>>
>>     This commit fixes the issue by using `TermFrequencyAttribute` to get
>>     the term frequencies instead of using fixed 1. Also adds test cases
>>     for the mentioned issue.
>> ---
>>  .../apache/lucene/queries/mlt/MoreLikeThis.java    | 12 +++-
>>  .../lucene/queries/mlt/TestMoreLikeThis.java       | 70
>> ++++++++++++++++++++++
>>  2 files changed, 79 insertions(+), 3 deletions(-)
>>
>> diff --git
>> a/lucene/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java
>> b/lucene/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java
>> index 61ebe93..7c077e5 100644
>> ---
>> a/lucene/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java
>> +++
>> b/lucene/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java
>> @@ -28,6 +28,7 @@ import java.util.Set;
>>  import org.apache.lucene.analysis.Analyzer;
>>  import org.apache.lucene.analysis.TokenStream;
>>  import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
>> +import org.apache.lucene.analysis.tokenattributes.TermFrequencyAttribute;
>>  import org.apache.lucene.document.Document;
>>  import org.apache.lucene.index.FieldInfos;
>>  import org.apache.lucene.index.Fields;
>> @@ -824,6 +825,7 @@ public final class MoreLikeThis {
>>        int tokenCount = 0;
>>        // for every token
>>        CharTermAttribute termAtt =
>> ts.addAttribute(CharTermAttribute.class);
>> +      TermFrequencyAttribute tfAtt =
>> ts.addAttribute(TermFrequencyAttribute.class);
>>        ts.reset();
>>        while (ts.incrementToken()) {
>>          String word = termAtt.toString();
>> @@ -838,9 +840,9 @@ public final class MoreLikeThis {
>>          // increment frequency
>>          Int cnt = termFreqMap.get(word);
>>          if (cnt == null) {
>> -          termFreqMap.put(word, new Int());
>> +          termFreqMap.put(word, new Int(tfAtt.getTermFrequency()));
>>          } else {
>> -          cnt.x++;
>> +          cnt.x += tfAtt.getTermFrequency();
>>          }
>>        }
>>        ts.end();
>> @@ -982,7 +984,11 @@ public final class MoreLikeThis {
>>      int x;
>>
>>      Int() {
>> -      x = 1;
>> +      this(1);
>> +    }
>> +
>> +    Int(int initialValue) {
>> +      x = initialValue;
>>      }
>>    }
>>  }
>> diff --git
>> a/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java
>> b/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java
>> index 4a60015..aeec534 100644
>> ---
>> a/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java
>> +++
>> b/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java
>> @@ -27,7 +27,12 @@ import java.util.Map;
>>
>>  import org.apache.lucene.analysis.Analyzer;
>>  import org.apache.lucene.analysis.MockAnalyzer;
>> +import org.apache.lucene.analysis.MockTokenFilter;
>>  import org.apache.lucene.analysis.MockTokenizer;
>> +import org.apache.lucene.analysis.TokenFilter;
>> +import org.apache.lucene.analysis.TokenStream;
>> +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
>> +import org.apache.lucene.analysis.tokenattributes.TermFrequencyAttribute;
>>  import org.apache.lucene.document.Document;
>>  import org.apache.lucene.document.Field;
>>  import org.apache.lucene.index.IndexReader;
>> @@ -41,6 +46,7 @@ import org.apache.lucene.search.Query;
>>  import org.apache.lucene.search.QueryUtils;
>>  import org.apache.lucene.search.TermQuery;
>>  import org.apache.lucene.store.Directory;
>> +import org.apache.lucene.util.ArrayUtil;
>>  import org.apache.lucene.util.LuceneTestCase;
>>
>>  import static org.hamcrest.core.Is.is;
>> @@ -427,5 +433,69 @@ public class TestMoreLikeThis extends LuceneTestCase
>> {
>>        analyzer.close();
>>      }
>>    }
>> +
>> +  public void testCustomFrequecy() throws IOException {
>> +    // define an analyzer with delimited term frequency, e.g. "foo|2
>> bar|3"
>> +    Analyzer analyzer = new Analyzer() {
>> +
>> +      @Override
>> +      protected TokenStreamComponents createComponents(String fieldName)
>> {
>> +        MockTokenizer tokenizer = new
>> MockTokenizer(MockTokenizer.WHITESPACE, false, 100);
>> +        MockTokenFilter filt = new MockTokenFilter(tokenizer,
>> MockTokenFilter.EMPTY_STOPSET);
>> +        return new TokenStreamComponents(tokenizer,
>> addCustomTokenFilter(filt));
>> +      }
>> +
>> +      TokenStream addCustomTokenFilter(TokenStream input) {
>> +        return new TokenFilter(input) {
>> +          final CharTermAttribute termAtt =
>> addAttribute(CharTermAttribute.class);
>> +          final TermFrequencyAttribute tfAtt =
>> addAttribute(TermFrequencyAttribute.class);
>> +
>> +          @Override
>> +          public boolean incrementToken() throws IOException {
>> +            if (input.incrementToken()) {
>> +              final char[] buffer = termAtt.buffer();
>> +              final int length = termAtt.length();
>> +              for (int i = 0; i < length; i++) {
>> +                if (buffer[i] == '|') {
>> +                  termAtt.setLength(i);
>> +                  i++;
>> +                  tfAtt.setTermFrequency(ArrayUtil.parseInt(buffer, i,
>> length - i));
>> +                  return true;
>> +                }
>> +              }
>> +              return true;
>> +            }
>> +            return false;
>> +          }
>> +        };
>> +      }
>> +    };
>> +
>> +    mlt.setAnalyzer(analyzer);
>> +    mlt.setFieldNames(new String[] {"text"});
>> +    mlt.setBoost(true);
>> +
>> +    final double boost10 = ((BooleanQuery) mlt.like("text", new
>> StringReader("lucene|10 release|1")))
>> +        .clauses()
>> +        .stream()
>> +        .map(BooleanClause::getQuery)
>> +        .map(BoostQuery.class::cast)
>> +        .filter(x -> ((TermQuery)
>> x.getQuery()).getTerm().text().equals("lucene"))
>> +        .mapToDouble(BoostQuery::getBoost)
>> +        .sum();
>> +
>> +    final double boost1 = ((BooleanQuery) mlt.like("text", new
>> StringReader("lucene|1 release|1")))
>> +        .clauses()
>> +        .stream()
>> +        .map(BooleanClause::getQuery)
>> +        .map(BoostQuery.class::cast)
>> +        .filter(x -> ((TermQuery)
>> x.getQuery()).getTerm().text().equals("lucene"))
>> +        .mapToDouble(BoostQuery::getBoost)
>> +        .sum();
>> +
>> +    // mlt should use the custom frequencies provided by the analyzer so
>> "lucene|10" should be boosted more than "lucene|1"
>> +    assertTrue(String.format("%s should be grater than %s", boost10,
>> boost1), boost10 > boost1);
>> +  }
>> +
>>    // TODO: add tests for the MoreLikeThisQuery
>>  }
>>
>>
>
> --
> http://www.needhamsoftware.com (work)
> http://www.the111shift.com (play)
>

Re: [lucene-solr] 01/02: Fix MLT like text with custom frequencies

Reply via email to