Sorry, wrong issue number! ----- Uwe Schindler Achterdiek 19, D-28357 Bremen http://www.thetaphi.de eMail: [email protected]
> -----Original Message----- > From: [email protected] [mailto:[email protected]] > Sent: Friday, June 9, 2017 11:53 PM > To: [email protected] > Subject: lucene-solr:master: LUCENE-7854: Add a new > DelimitedTermFrequencyTokenFilter that allows to mark tokens with a > custom term frequency > > Repository: lucene-solr > Updated Branches: > refs/heads/master c37b37743 -> 5844ed4ac > > > LUCENE-7854: Add a new DelimitedTermFrequencyTokenFilter that allows to > mark tokens with a custom term frequency > > > Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo > Commit: http://git-wip-us.apache.org/repos/asf/lucene- > solr/commit/5844ed4a > Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/5844ed4a > Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/5844ed4a > > Branch: refs/heads/master > Commit: 5844ed4ac95373cbdb512e84b8ad08f78c2baf57 > Parents: c37b377 > Author: Uwe Schindler <[email protected]> > Authored: Fri Jun 9 23:52:19 2017 +0200 > Committer: Uwe Schindler <[email protected]> > Committed: Fri Jun 9 23:52:19 2017 +0200 > > ---------------------------------------------------------------------- > lucene/CHANGES.txt | 6 ++ > .../DelimitedTermFrequencyTokenFilter.java | 75 +++++++++++++++++++ > ...elimitedTermFrequencyTokenFilterFactory.java | 53 ++++++++++++++ > ...ache.lucene.analysis.util.TokenFilterFactory | 1 + > .../DelimitedTermFrequencyTokenFilterTest.java | 77 > ++++++++++++++++++++ > 5 files changed, 212 insertions(+) > ---------------------------------------------------------------------- > > > http://git-wip-us.apache.org/repos/asf/lucene- > solr/blob/5844ed4a/lucene/CHANGES.txt > ---------------------------------------------------------------------- > diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt > index 0251243..12e5000 100644 > --- a/lucene/CHANGES.txt > +++ b/lucene/CHANGES.txt > @@ -18,6 +18,12 @@ New Features > with a custom token stream allows indexing custom term frequencies > (Mike McCandless) > > +* 
LUCENE-7866: Add a new DelimitedTermFrequencyTokenFilter that allows > to > + mark tokens with a custom term frequency (LUCENE-7854). It parses a > numeric > + value after a separator char ('|') at the end of each token and changes > + the term frequency to this value. (Uwe Schindler, Robert Muir, > + Mike McCandless) > + > API Changes > > * LUCENE-2605: Classic QueryParser no longer splits on whitespace by > default. > > http://git-wip-us.apache.org/repos/asf/lucene- > solr/blob/5844ed4a/lucene/analysis/common/src/java/org/apache/lucene/ > analysis/miscellaneous/DelimitedTermFrequencyTokenFilter.java > ---------------------------------------------------------------------- > diff --git > a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellane > ous/DelimitedTermFrequencyTokenFilter.java > b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellane > ous/DelimitedTermFrequencyTokenFilter.java > new file mode 100644 > index 0000000..e2095ad > --- /dev/null > +++ > b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellane > ous/DelimitedTermFrequencyTokenFilter.java > @@ -0,0 +1,75 @@ > +/* > + * Licensed to the Apache Software Foundation (ASF) under one or more > + * contributor license agreements. See the NOTICE file distributed with > + * this work for additional information regarding copyright ownership. > + * The ASF licenses this file to You under the Apache License, Version 2.0 > + * (the "License"); you may not use this file except in compliance with > + * the License. You may obtain a copy of the License at > + * > + * http://www.apache.org/licenses/LICENSE-2.0 > + * > + * Unless required by applicable law or agreed to in writing, software > + * distributed under the License is distributed on an "AS IS" BASIS, > + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or > implied. > + * See the License for the specific language governing permissions and > + * limitations under the License. 
> + */ > + > +package org.apache.lucene.analysis.miscellaneous; > + > +import java.io.IOException; > + > +import org.apache.lucene.analysis.TokenFilter; > +import org.apache.lucene.analysis.TokenStream; > +import > org.apache.lucene.analysis.tokenattributes.TermFrequencyAttribute; > +import org.apache.lucene.index.IndexOptions; > +import org.apache.lucene.util.ArrayUtil; > +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; > + > + > +/** > + * Characters before the delimiter are the "token", the textual integer after > is the term frequency. > + * To use this {@code TokenFilter} the field must be indexed with > + * {@link IndexOptions#DOCS_AND_FREQS} but no positions or offsets. > + * <p> > + * For example, if the delimiter is '|', then for the string "foo|5", "foo" > is the > token > + * and "5" is a term frequency. If there is no delimiter, the TokenFilter > does > not modify > + * the term frequency. > + * <p> > + * Note make sure your Tokenizer doesn't split on the delimiter, or this > won't work > + */ > +public final class DelimitedTermFrequencyTokenFilter extends TokenFilter { > + public static final char DEFAULT_DELIMITER = '|'; > + > + private final char delimiter; > + private final CharTermAttribute termAtt = > addAttribute(CharTermAttribute.class); > + private final TermFrequencyAttribute tfAtt = > addAttribute(TermFrequencyAttribute.class); > + > + > + public DelimitedTermFrequencyTokenFilter(TokenStream input) { > + this(input, DEFAULT_DELIMITER); > + } > + > + public DelimitedTermFrequencyTokenFilter(TokenStream input, char > delimiter) { > + super(input); > + this.delimiter = delimiter; > + } > + > + @Override > + public boolean incrementToken() throws IOException { > + if (input.incrementToken()) { > + final char[] buffer = termAtt.buffer(); > + final int length = termAtt.length(); > + for (int i = 0; i < length; i++) { > + if (buffer[i] == delimiter) { > + termAtt.setLength(i); // simply set a new length > + i++; > + 
tfAtt.setTermFrequency(ArrayUtil.parseInt(buffer, i, length - i)); > + return true; > + } > + } > + return true; > + } > + return false; > + } > +} > > http://git-wip-us.apache.org/repos/asf/lucene- > solr/blob/5844ed4a/lucene/analysis/common/src/java/org/apache/lucene/ > analysis/miscellaneous/DelimitedTermFrequencyTokenFilterFactory.java > ---------------------------------------------------------------------- > diff --git > a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellane > ous/DelimitedTermFrequencyTokenFilterFactory.java > b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellane > ous/DelimitedTermFrequencyTokenFilterFactory.java > new file mode 100644 > index 0000000..af5c0fa > --- /dev/null > +++ > b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellane > ous/DelimitedTermFrequencyTokenFilterFactory.java > @@ -0,0 +1,53 @@ > +/* > + * Licensed to the Apache Software Foundation (ASF) under one or more > + * contributor license agreements. See the NOTICE file distributed with > + * this work for additional information regarding copyright ownership. > + * The ASF licenses this file to You under the Apache License, Version 2.0 > + * (the "License"); you may not use this file except in compliance with > + * the License. You may obtain a copy of the License at > + * > + * http://www.apache.org/licenses/LICENSE-2.0 > + * > + * Unless required by applicable law or agreed to in writing, software > + * distributed under the License is distributed on an "AS IS" BASIS, > + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or > implied. > + * See the License for the specific language governing permissions and > + * limitations under the License. 
> + */ > + > +package org.apache.lucene.analysis.miscellaneous; > + > +import java.util.Map; > + > +import org.apache.lucene.analysis.TokenStream; > +import org.apache.lucene.analysis.util.TokenFilterFactory; > + > +/** > + * Factory for {@link DelimitedTermFrequencyTokenFilter}. The field must > have {@code omitPositions=true}. > + * <pre class="prettyprint"> > + * <fieldType name="text_tfdl" class="solr.TextField" > omitPositions="true"> > + * <analyzer> > + * <tokenizer class="solr.WhitespaceTokenizerFactory"/> > + * <filter class="solr.DelimitedTermFrequencyTokenFilterFactory" > delimiter="|"/> > + * </analyzer> > + * </fieldType></pre> > + */ > +public class DelimitedTermFrequencyTokenFilterFactory extends > TokenFilterFactory { > + public static final String DELIMITER_ATTR = "delimiter"; > + > + private final char delimiter; > + > + /** Creates a new DelimitedPayloadTokenFilterFactory */ > + public DelimitedTermFrequencyTokenFilterFactory(Map<String, String> > args) { > + super(args); > + delimiter = getChar(args, DELIMITER_ATTR, > DelimitedTermFrequencyTokenFilter.DEFAULT_DELIMITER); > + if (!args.isEmpty()) { > + throw new IllegalArgumentException("Unknown parameters: " + args); > + } > + } > + > + @Override > + public DelimitedTermFrequencyTokenFilter create(TokenStream input) { > + return new DelimitedTermFrequencyTokenFilter(input, delimiter); > + } > +} > \ No newline at end of file > > http://git-wip-us.apache.org/repos/asf/lucene- > solr/blob/5844ed4a/lucene/analysis/common/src/resources/META- > INF/services/org.apache.lucene.analysis.util.TokenFilterFactory > ---------------------------------------------------------------------- > diff --git a/lucene/analysis/common/src/resources/META- > INF/services/org.apache.lucene.analysis.util.TokenFilterFactory > b/lucene/analysis/common/src/resources/META- > INF/services/org.apache.lucene.analysis.util.TokenFilterFactory > index 4e33006..bc19c4a 100644 > --- a/lucene/analysis/common/src/resources/META- > 
INF/services/org.apache.lucene.analysis.util.TokenFilterFactory > +++ b/lucene/analysis/common/src/resources/META- > INF/services/org.apache.lucene.analysis.util.TokenFilterFactory > @@ -63,6 +63,7 @@ > org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilterFactory > org.apache.lucene.analysis.miscellaneous.CapitalizationFilterFactory > org.apache.lucene.analysis.miscellaneous.CodepointCountFilterFactory > org.apache.lucene.analysis.miscellaneous.DateRecognizerFilterFactory > +org.apache.lucene.analysis.miscellaneous.DelimitedTermFrequencyTokenFil > terFactory > org.apache.lucene.analysis.miscellaneous.FingerprintFilterFactory > org.apache.lucene.analysis.miscellaneous.FixBrokenOffsetsFilterFactory > org.apache.lucene.analysis.miscellaneous.HyphenatedWordsFilterFactory > > http://git-wip-us.apache.org/repos/asf/lucene- > solr/blob/5844ed4a/lucene/analysis/common/src/test/org/apache/lucene/a > nalysis/miscellaneous/DelimitedTermFrequencyTokenFilterTest.java > ---------------------------------------------------------------------- > diff --git > a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellane > ous/DelimitedTermFrequencyTokenFilterTest.java > b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellane > ous/DelimitedTermFrequencyTokenFilterTest.java > new file mode 100644 > index 0000000..7609f6e > --- /dev/null > +++ > b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellane > ous/DelimitedTermFrequencyTokenFilterTest.java > @@ -0,0 +1,77 @@ > +/* > + * Licensed to the Apache Software Foundation (ASF) under one or more > + * contributor license agreements. See the NOTICE file distributed with > + * this work for additional information regarding copyright ownership. > + * The ASF licenses this file to You under the Apache License, Version 2.0 > + * (the "License"); you may not use this file except in compliance with > + * the License. 
You may obtain a copy of the License at > + * > + * http://www.apache.org/licenses/LICENSE-2.0 > + * > + * Unless required by applicable law or agreed to in writing, software > + * distributed under the License is distributed on an "AS IS" BASIS, > + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or > implied. > + * See the License for the specific language governing permissions and > + * limitations under the License. > + */ > + > +package org.apache.lucene.analysis.miscellaneous; > + > +import org.apache.lucene.analysis.BaseTokenStreamTestCase; > +import org.apache.lucene.analysis.TokenStream; > +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; > +import > org.apache.lucene.analysis.tokenattributes.TermFrequencyAttribute; > + > +public class DelimitedTermFrequencyTokenFilterTest extends > BaseTokenStreamTestCase { > + > + public void testTermFrequency() throws Exception { > + String test = "The quick|40 red|4 fox|06 jumped|1 over the lazy|2 > brown|123 dogs|1024"; > + DelimitedTermFrequencyTokenFilter filter = > + new > DelimitedTermFrequencyTokenFilter(whitespaceMockTokenizer(test)); > + CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class); > + TermFrequencyAttribute tfAtt = > filter.getAttribute(TermFrequencyAttribute.class); > + filter.reset(); > + assertTermEquals("The", filter, termAtt, tfAtt, 1); > + assertTermEquals("quick", filter, termAtt, tfAtt, 40); > + assertTermEquals("red", filter, termAtt, tfAtt, 4); > + assertTermEquals("fox", filter, termAtt, tfAtt, 6); > + assertTermEquals("jumped", filter, termAtt, tfAtt, 1); > + assertTermEquals("over", filter, termAtt, tfAtt, 1); > + assertTermEquals("the", filter, termAtt, tfAtt, 1); > + assertTermEquals("lazy", filter, termAtt, tfAtt, 2); > + assertTermEquals("brown", filter, termAtt, tfAtt, 123); > + assertTermEquals("dogs", filter, termAtt, tfAtt, 1024); > + assertFalse(filter.incrementToken()); > + filter.end(); > + filter.close(); > + } > + > + 
public void testInvalidNegativeTf() throws Exception { > + String test = "foo bar|-20"; > + DelimitedTermFrequencyTokenFilter filter = > + new > DelimitedTermFrequencyTokenFilter(whitespaceMockTokenizer(test)); > + CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class); > + TermFrequencyAttribute tfAtt = > filter.getAttribute(TermFrequencyAttribute.class); > + filter.reset(); > + assertTermEquals("foo", filter, termAtt, tfAtt, 1); > + IllegalArgumentException iae = > expectThrows(IllegalArgumentException.class, filter::incrementToken); > + assertEquals("Term frequency must be 1 or greater; got -20", > iae.getMessage()); > + } > + > + public void testInvalidFloatTf() throws Exception { > + String test = "foo bar|1.2"; > + DelimitedTermFrequencyTokenFilter filter = > + new > DelimitedTermFrequencyTokenFilter(whitespaceMockTokenizer(test)); > + CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class); > + TermFrequencyAttribute tfAtt = > filter.getAttribute(TermFrequencyAttribute.class); > + filter.reset(); > + assertTermEquals("foo", filter, termAtt, tfAtt, 1); > + expectThrows(NumberFormatException.class, filter::incrementToken); > + } > + > + void assertTermEquals(String expected, TokenStream stream, > CharTermAttribute termAtt, TermFrequencyAttribute tfAtt, int expectedTf) > throws Exception { > + assertTrue(stream.incrementToken()); > + assertEquals(expected, termAtt.toString()); > + assertEquals(expectedTf, tfAtt.getTermFrequency()); > + } > +} --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
