[
https://issues.apache.org/jira/browse/LUCENE-7603?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15785544#comment-15785544
]
ASF GitHub Bot commented on LUCENE-7603:
----------------------------------------
Github user dsmiley commented on a diff in the pull request:
https://github.com/apache/lucene-solr/pull/129#discussion_r94148633
--- Diff:
lucene/core/src/java/org/apache/lucene/util/graph/GraphTokenStreamFiniteStrings.java
---
@@ -0,0 +1,294 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.util.graph;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.BytesTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import
org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IntsRef;
+import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.FiniteStringsIterator;
+import org.apache.lucene.util.automaton.Operations;
+import org.apache.lucene.util.automaton.Transition;
+
+import static
org.apache.lucene.util.automaton.Operations.DEFAULT_MAX_DETERMINIZED_STATES;
+
+/**
+ * Creates a list of {@link TokenStream} where each stream is the tokens
that make up a finite string in graph token stream. To do this,
+ * the graph token stream is converted to an {@link Automaton} and from
there we use a {@link FiniteStringsIterator} to collect the various
+ * token streams for each finite string.
+ */
+public class GraphTokenStreamFiniteStrings {
+ /* TODO:
+ Most of this is a combination of code from TermAutomatonQuery and
TokenStreamToTermAutomatonQuery. Would be
+ good to make this so it could be shared. */
+ private final Automaton.Builder builder;
+ Automaton det;
+ private final Map<BytesRef, Integer> termToID = new HashMap<>();
+ private final Map<Integer, BytesRef> idToTerm = new HashMap<>();
+ private int anyTermID = -1;
+
+ public GraphTokenStreamFiniteStrings() {
+ this.builder = new Automaton.Builder();
+ }
+
+ private static class BytesRefArrayTokenStream extends TokenStream {
+ private final BytesTermAttribute termAtt =
addAttribute(BytesTermAttribute.class);
+ private final BytesRef[] terms;
+ private int offset;
+
+ BytesRefArrayTokenStream(BytesRef[] terms) {
+ this.terms = terms;
+ offset = 0;
+ }
+
+ @Override
+ public boolean incrementToken() throws IOException {
+ if (offset < terms.length) {
+ clearAttributes();
+ termAtt.setBytesRef(terms[offset]);
+ offset = offset + 1;
+ return true;
+ }
+
+ return false;
+ }
+ }
+
+ /**
+ * Gets the list of finite string token streams from the given input
graph token stream.
+ */
+ public List<TokenStream> getTokenStreams(final TokenStream in) throws
IOException {
+ // build automation
+ build(in);
+
+ List<TokenStream> tokenStreams = new ArrayList<>();
+ final FiniteStringsIterator finiteStrings = new
FiniteStringsIterator(det);
+ for (IntsRef string; (string = finiteStrings.next()) != null; ) {
+ final BytesRef[] tokens = new BytesRef[string.length];
+ for (int idx = string.offset, len = string.offset + string.length;
idx < len; idx++) {
+ tokens[idx - string.offset] = idToTerm.get(string.ints[idx]);
+ }
+
+ tokenStreams.add(new BytesRefArrayTokenStream(tokens));
+ }
+
+ return tokenStreams;
+ }
+
+ private void build(final TokenStream in) throws IOException {
+ if (det != null) {
+ throw new IllegalStateException("Automation already built");
+ }
+
+ final TermToBytesRefAttribute termBytesAtt =
in.addAttribute(TermToBytesRefAttribute.class);
+ final PositionIncrementAttribute posIncAtt =
in.addAttribute(PositionIncrementAttribute.class);
+ final PositionLengthAttribute posLengthAtt =
in.addAttribute(PositionLengthAttribute.class);
+ final OffsetAttribute offsetAtt =
in.addAttribute(OffsetAttribute.class);
+
+ in.reset();
+
+ int pos = -1;
+ int lastPos = 0;
+ int maxOffset = 0;
+ int maxPos = -1;
+ int state = -1;
+ while (in.incrementToken()) {
+ int posInc = posIncAtt.getPositionIncrement();
+ assert pos > -1 || posInc > 0;
+
+ if (posInc > 1) {
--- End diff --
This seems like a notable limitation that should be documented in javadocs
somewhere. Can't we support holes without demanding the stream use '*'? And
might there be a test for this?
> Support Graph Token Streams in QueryBuilder
> -------------------------------------------
>
> Key: LUCENE-7603
> URL: https://issues.apache.org/jira/browse/LUCENE-7603
> Project: Lucene - Core
> Issue Type: Improvement
> Components: core/queryparser, core/search
> Reporter: Matt Weber
>
> With [LUCENE-6664|https://issues.apache.org/jira/browse/LUCENE-6664] we can
> use multi-term synonyms at query time. A "graph token stream" will be created,
> which is nothing more than using the position length attribute on
> stacked tokens to indicate how many positions a token should span. Currently
> the position length attribute on tokens is ignored during query parsing.
> This issue will add support for handling these graph token streams inside the
> QueryBuilder utility class used by query parsers.
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]