[ https://issues.apache.org/jira/browse/LUCENE-7603?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15787413#comment-15787413 ]
ASF GitHub Bot commented on LUCENE-7603:
----------------------------------------

Github user mikemccand commented on a diff in the pull request:

    https://github.com/apache/lucene-solr/pull/129#discussion_r94217160

    --- Diff: lucene/core/src/java/org/apache/lucene/util/graph/GraphTokenStreamFiniteStrings.java ---
    @@ -0,0 +1,237 @@
    +/*
    + * Licensed to the Apache Software Foundation (ASF) under one or more
    + * contributor license agreements. See the NOTICE file distributed with
    + * this work for additional information regarding copyright ownership.
    + * The ASF licenses this file to You under the Apache License, Version 2.0
    + * (the "License"); you may not use this file except in compliance with
    + * the License. You may obtain a copy of the License at
    + *
    + *     http://www.apache.org/licenses/LICENSE-2.0
    + *
    + * Unless required by applicable law or agreed to in writing, software
    + * distributed under the License is distributed on an "AS IS" BASIS,
    + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    + * See the License for the specific language governing permissions and
    + * limitations under the License.
    + */
    +
    +package org.apache.lucene.util.graph;
    +
    +import java.io.IOException;
    +import java.util.ArrayList;
    +import java.util.HashMap;
    +import java.util.List;
    +import java.util.Map;
    +
    +import org.apache.lucene.analysis.TokenStream;
    +import org.apache.lucene.analysis.tokenattributes.BytesTermAttribute;
    +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
    +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
    +import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
    +import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
    +import org.apache.lucene.util.BytesRef;
    +import org.apache.lucene.util.IntsRef;
    +import org.apache.lucene.util.automaton.Automaton;
    +import org.apache.lucene.util.automaton.FiniteStringsIterator;
    +import org.apache.lucene.util.automaton.Operations;
    +
    +import static org.apache.lucene.util.automaton.Operations.DEFAULT_MAX_DETERMINIZED_STATES;
    +
    +/**
    + * Creates a list of {@link TokenStream} where each stream is the tokens that make up a finite string in graph token stream. To do this,
    + * the graph token stream is converted to an {@link Automaton} and from there we use a {@link FiniteStringsIterator} to collect the various
    + * token streams for each finite string.
    + */
    +public class GraphTokenStreamFiniteStrings {
    +  /* TODO:
    +     Most of this is a combination of code from TermAutomatonQuery and TokenStreamToTermAutomatonQuery. Would be
    +     good to make this so it could be shared.
    +   */
    +  private final Automaton.Builder builder;
    +  Automaton det;
    +  private final Map<BytesRef, Integer> termToID = new HashMap<>();
    +  private final Map<Integer, BytesRef> idToTerm = new HashMap<>();
    +  private final Map<Integer, Integer> idToInc = new HashMap<>();
    +
    +  public GraphTokenStreamFiniteStrings() {
    +    this.builder = new Automaton.Builder();
    +  }
    +
    +  private static class BytesRefArrayTokenStream extends TokenStream {
    +    private final BytesTermAttribute termAtt = addAttribute(BytesTermAttribute.class);
    +    private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
    +
    +    private final BytesRef[] terms;
    +    private final int[] increments;
    +    private int offset;
    +
    +    BytesRefArrayTokenStream(BytesRef[] terms, int[] increments) {
    +      this.terms = terms;
    +      this.increments = increments;
    +      assert terms.length == increments.length;
    +      offset = 0;
    +    }
    +
    +    @Override
    +    public boolean incrementToken() throws IOException {
    +      if (offset < terms.length) {
    +        clearAttributes();
    +        termAtt.setBytesRef(terms[offset]);
    +        posIncAtt.setPositionIncrement(increments[offset]);
    +        offset = offset + 1;
    +        return true;
    +      }
    +
    +      return false;
    +    }
    +  }
    +
    +  /**
    +   * Gets the list of finite string token streams from the given input graph token stream.
    +   */
    +  public List<TokenStream> getTokenStreams(final TokenStream in) throws IOException {
    --- End diff --

    Could we make this method private, make this class's constructor private, and add a
    `static` method here, the sole public method on this class, that receives the incoming
    `TokenStream` and returns the resulting `TokenStream[]`? Otherwise the API is sort of
    awkward, since e.g. this method seems like a getter yet it's doing lots of side-effects
    under the hood ...


> Support Graph Token Streams in QueryBuilder
> -------------------------------------------
>
>                 Key: LUCENE-7603
>                 URL: https://issues.apache.org/jira/browse/LUCENE-7603
>             Project: Lucene - Core
>          Issue Type: Improvement
>          Components: core/queryparser, core/search
>            Reporter: Matt Weber
>
> With [LUCENE-6664|https://issues.apache.org/jira/browse/LUCENE-6664] we can
> use multi-term synonyms at query time. A "graph token stream" will be created,
> which is nothing more than using the position length attribute on stacked
> tokens to indicate how many positions a token should span. Currently the
> position length attribute on tokens is ignored during query parsing. This
> issue will add support for handling these graph token streams inside the
> QueryBuilder utility class used by query parsers.
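
[Editor's note] The following is not part of the JIRA message or the patch; it is a minimal, hedged sketch showing where the "graph" in a graph token stream comes from, using SynonymGraphFilter from LUCENE-6664. The field name "body", the sample text, and the "ny" -> "new york" mapping are invented for illustration.

    import java.io.IOException;

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.Tokenizer;
    import org.apache.lucene.analysis.core.WhitespaceTokenizer;
    import org.apache.lucene.analysis.synonym.SynonymGraphFilter;
    import org.apache.lucene.analysis.synonym.SynonymMap;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
    import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
    import org.apache.lucene.util.CharsRef;
    import org.apache.lucene.util.CharsRefBuilder;

    public class GraphTokenStreamDemo {
      public static void main(String[] args) throws IOException {
        // Map the single token "ny" to the multi-token synonym "new york", keeping the original.
        SynonymMap.Builder synBuilder = new SynonymMap.Builder(true);
        synBuilder.add(new CharsRef("ny"),
            SynonymMap.Builder.join(new String[] {"new", "york"}, new CharsRefBuilder()), true);
        SynonymMap synonyms = synBuilder.build();

        Analyzer analyzer = new Analyzer() {
          @Override
          protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new WhitespaceTokenizer();
            TokenStream stream = new SynonymGraphFilter(tokenizer, synonyms, true);
            return new TokenStreamComponents(tokenizer, stream);
          }
        };

        // Stacked tokens at the same position carry a position length telling how many
        // positions they span; this is the graph that QueryBuilder currently ignores.
        try (TokenStream ts = analyzer.tokenStream("body", "ny pizza")) {
          CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
          PositionIncrementAttribute posInc = ts.addAttribute(PositionIncrementAttribute.class);
          PositionLengthAttribute posLen = ts.addAttribute(PositionLengthAttribute.class);
          ts.reset();
          while (ts.incrementToken()) {
            System.out.println(term + " posInc=" + posInc.getPositionIncrement()
                + " posLen=" + posLen.getPositionLength());
          }
          ts.end();
        }
      }
    }

The finite strings of this little graph should roughly be "ny pizza" and "new york pizza", which is exactly the set of paths the new GraphTokenStreamFiniteStrings class is meant to enumerate for QueryBuilder.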
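[Editor's note] Likewise not part of the patch: a rough, compile-only sketch of the API shape mikemccand suggests in the review comment above (private constructor, private worker method, one static public entry point). All names here are illustrative, and the worker body is deliberately left unimplemented because the actual conversion logic lives in the pull request.

    import java.io.IOException;
    import java.util.List;

    import org.apache.lucene.analysis.TokenStream;

    /**
     * Illustrative sketch only: a single static entry point, with the stateful
     * per-invocation instance hidden behind a private constructor.
     */
    public final class GraphTokenStreamFiniteStringsSketch {

      // One private instance per call, so the automaton builder and term-id maps
      // never leak between inputs or between threads.
      private GraphTokenStreamFiniteStringsSketch() {
      }

      /** Sole public method: turns the incoming graph TokenStream into one TokenStream per finite string. */
      public static TokenStream[] getTokenStreams(TokenStream in) throws IOException {
        GraphTokenStreamFiniteStringsSketch instance = new GraphTokenStreamFiniteStringsSketch();
        List<TokenStream> streams = instance.process(in);
        return streams.toArray(new TokenStream[streams.size()]);
      }

      // Plays the role of the public getTokenStreams(TokenStream) in the diff above;
      // body intentionally omitted in this sketch.
      private List<TokenStream> process(TokenStream in) throws IOException {
        throw new UnsupportedOperationException("sketch only; see the pull request for the implementation");
      }
    }

With that shape, a caller such as QueryBuilder would simply invoke the static method and never see the mutable builder state.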