cpoerschke commented on a change in pull request #476:
URL: https://github.com/apache/solr/pull/476#discussion_r786178075
##########
File path: solr/core/src/java/org/apache/solr/core/SchemaCodecFactory.java
##########
@@ -114,6 +117,25 @@ public DocValuesFormat getDocValuesFormatForField(String
field) {
}
return super.getDocValuesFormatForField(field);
}
+
+ @Override
+ public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
+ final SchemaField schemaField =
core.getLatestSchema().getFieldOrNull(field);
+ if (schemaField != null && schemaField.getType() instanceof
DenseVectorField) {
+ DenseVectorField vectorType = (DenseVectorField)
schemaField.getType();
Review comment:
minor: the following could avoid duplicate `schemaField.getType()` calls
```suggestion
final FieldType fieldType = (schemaField == null ? null :
schemaField.getType());
if (fieldType instanceof DenseVectorField) {
DenseVectorField vectorType = (DenseVectorField) fieldType;
```
##########
File path: solr/core/src/java/org/apache/solr/schema/DenseVectorField.java
##########
@@ -0,0 +1,256 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.schema;
+
+import org.apache.lucene.codecs.lucene90.Lucene90HnswVectorsFormat;
+import org.apache.lucene.document.KnnVectorField;
+import org.apache.lucene.index.IndexableField;
+import org.apache.lucene.index.VectorSimilarityFunction;
+import org.apache.lucene.queries.function.ValueSource;
+import org.apache.lucene.search.KnnVectorQuery;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.SortField;
+import org.apache.lucene.util.hnsw.HnswGraph;
+import org.apache.solr.common.SolrException;
+import org.apache.solr.search.QParser;
+import org.apache.solr.uninverting.UninvertingReader;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+
+import static java.util.Optional.ofNullable;
+import static
org.apache.lucene.codecs.lucene90.Lucene90HnswVectorsFormat.DEFAULT_BEAM_WIDTH;
+import static
org.apache.lucene.codecs.lucene90.Lucene90HnswVectorsFormat.DEFAULT_MAX_CONN;
+
+/**
+ * Provides a field type to support Lucene's {@link
+ * org.apache.lucene.document.KnnVectorField}.
+ * See {@link org.apache.lucene.search.KnnVectorQuery} for more details.
+ * It supports a fixed cardinality dimension for the vector and a fixed
similarity function.
+ * The default similarity is EUCLIDEAN (L2).
+ * The default index codec format is specified in the Lucene Codec constructor.
+ * For Lucene 9.0 e.g.
+ * See {@link org.apache.lucene.codecs.lucene90.Lucene90Codec}
+ * Currently only {@link
org.apache.lucene.codecs.lucene90.Lucene90HnswVectorsFormat} is supported for
+ * advanced hyper-parameter customisation.
+ * See {@link org.apache.lucene.util.hnsw.HnswGraph} for more details about
the implementation.
+ *
+ * <br>
+ * Only {@code Indexed} and {@code Stored} attributes are supported.
+ */
+public class DenseVectorField extends FloatPointField {
+
+ static final String KNN_VECTOR_DIMENSION = "vectorDimension";
+ static final String KNN_SIMILARITY_FUNCTION = "similarityFunction";
+
+ static final String CODEC_FORMAT = "codecFormat";
+ static final String HNSW_MAX_CONNECTIONS = "hnswMaxConnections";
+ static final String HNSW_BEAM_WIDTH = "hnswBeamWidth";
+
+ int dimension;
+ VectorSimilarityFunction similarityFunction;
+ VectorSimilarityFunction DEFAULT_SIMILARITY =
VectorSimilarityFunction.EUCLIDEAN;
+
+ String codecFormat;
+ /**
+ * This parameter is coupled with the {@link Lucene90HnswVectorsFormat}
format implementation.
+ * Controls how many of the nearest neighbor candidates are connected to
the new node. Defaults to
+ * {@link Lucene90HnswVectorsFormat#DEFAULT_MAX_CONN}. See {@link
HnswGraph} for more details.
+ */
+ int hnswMaxConn;
+ /**
+ * This parameter is coupled with the {@link Lucene90HnswVectorsFormat}
format implementation.
+ * The number of candidate neighbors to track while searching the graph
for each newly inserted
+ * node. Defaults to {@link
Lucene90HnswVectorsFormat#DEFAULT_BEAM_WIDTH}. See {@link
+ * HnswGraph} for details.
+ */
+ int hnswBeamWidth;
+
+ @Override
+ public void init(IndexSchema schema, Map<String, String> args) {
+ this.dimension = ofNullable(args.get(KNN_VECTOR_DIMENSION))
+ .map(value -> Integer.parseInt(value))
+ .orElseThrow(() -> new
SolrException(SolrException.ErrorCode.SERVER_ERROR, "the vector dimension is a
mandatory parameter"));
+ args.remove(KNN_VECTOR_DIMENSION);
+
+ this.similarityFunction = ofNullable(args.get(KNN_SIMILARITY_FUNCTION))
+ .map(value ->
VectorSimilarityFunction.valueOf(value.toUpperCase(Locale.ROOT)))
+ .orElse(DEFAULT_SIMILARITY);
+ args.remove(KNN_SIMILARITY_FUNCTION);
+
+ this.codecFormat = args.get(CODEC_FORMAT);
+ args.remove(CODEC_FORMAT);
+
+ this.hnswMaxConn = ofNullable(args.get(HNSW_MAX_CONNECTIONS))
+ .map(value -> Integer.parseInt(value))
+ .orElse(DEFAULT_MAX_CONN);
+ args.remove(HNSW_MAX_CONNECTIONS);
+
+ this.hnswBeamWidth = ofNullable(args.get(HNSW_BEAM_WIDTH))
+ .map(value -> Integer.parseInt(value))
+ .orElse(DEFAULT_BEAM_WIDTH);
+ args.remove(HNSW_BEAM_WIDTH);
+
+ this.properties &= ~MULTIVALUED;
+ this.properties &= ~UNINVERTIBLE;
+
+ super.init(schema, args);
+ }
+
+ public int getDimension() {
+ return dimension;
+ }
+
+ public String getCodecFormat() {
+ return codecFormat;
+ }
+
+ public Integer getHnswMaxConn() {
+ return hnswMaxConn;
+ }
+
+ public Integer getHnswBeamWidth() {
+ return hnswBeamWidth;
+ }
+
+ @Override
+ public void checkSchemaField(final SchemaField field) throws SolrException
{
+ super.checkSchemaField(field);
+ if (field.multiValued()) {
+ throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
+ getClass().getSimpleName() + " fields can not be
multiValued: " + field.getName());
+ }
+
+ if (field.hasDocValues()) {
+ throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
+ getClass().getSimpleName() + " fields can not have
docValues: " + field.getName());
+ }
+ }
+
+ public List<IndexableField> createFields(SchemaField field, Object value) {
+ List<IndexableField> fields = new ArrayList<>();
+ float[] parsedVector;
+ try {
+ parsedVector = parseVector(value);
+ } catch (RuntimeException e) {
+ throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
"Error while creating field '" + field + "' from value '" + value + "',
expected format:'[f1, f2, f3...fn]' e.g. [1.0, 3.4, 5.6]", e);
+ }
+
+ if (field.indexed()) {
+ fields.add(createField(field, parsedVector));
+ }
+ if (field.stored()) {
+ for (float vectorElement : parsedVector) {
+ fields.add(getStoredField(field, vectorElement));
+ }
+ }
+ return fields;
+ }
+
+ @Override
+ public IndexableField createField(SchemaField field, Object parsedVector) {
+ float[] typedVector;
+ if (parsedVector == null) return null;
+ typedVector = (float[]) parsedVector;
+ return new KnnVectorField(field.getName(), typedVector,
similarityFunction);
+ }
+
+ /**
+ * Index Time Parsing
+ * The inputValue is an ArrayList with a type that depends on the loader
used:
+ * - {@link org.apache.solr.handler.loader.XMLLoader}, {@link
org.apache.solr.handler.loader.CSVLoader} produces an ArrayList of String
+ * - {@link org.apache.solr.handler.loader.JsonLoader} produces an
ArrayList of Double
+ * - {@link org.apache.solr.handler.loader.JavabinLoader} produces an
ArrayList of Float
+ *
+ * @param inputValue - An {@link ArrayList} containing the elements of the
vector
+ * @return the vector parsed
+ */
+ float[] parseVector(Object inputValue) {
+ if (!(inputValue instanceof List)) {
+ throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
"incorrect vector format." +
+ " The expected format is an array :'[f1,f2..f3]' where
each element f is a float");
+ }
+ List<?> inputVector = (List<?>) inputValue;
+ if (inputVector.size() != dimension) {
+ throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
"incorrect vector dimension." +
+ " The vector value has size "
+ + inputVector.size() + " while it is expected a vector
with size " + dimension);
+ }
+
+ float[] vector = new float[dimension];
+ if (inputVector.get(0) instanceof CharSequence) {
+ for (int i = 0; i < dimension; i++) {
+ try {
+ vector[i] =
Float.parseFloat(inputVector.get(i).toString());
+ } catch (NumberFormatException e) {
+ throw new
SolrException(SolrException.ErrorCode.BAD_REQUEST, "incorrect vector element:
'" + inputVector.get(i) +
+ "'. The expected format is:'[f1,f2..f3]' where
each element f is a float");
+ }
+ }
+ } else if (inputVector.get(0) instanceof Number) {
+ for (int i = 0; i < dimension; i++) {
+ vector[i] = ((Number) inputVector.get(i)).floatValue();
+ }
+ }
Review comment:
Should this also throw, or could we add a comment explaining why it's okay
to just return the default vector here?
```suggestion
} else {
throw new SolrException(...)
}
```
##########
File path: solr/core/src/java/org/apache/solr/update/DocumentBuilder.java
##########
@@ -151,63 +153,49 @@ public static Document toDocument(SolrInputDocument doc,
IndexSchema schema, boo
String name = field.getName();
SchemaField sfield = schema.getFieldOrNull(name);
- boolean used = false;
+ List<CopyField> copyFields = schema.getCopyFieldsList(name);
+ if( copyFields.size() == 0 ) copyFields = null;
Review comment:
```suggestion
if( copyFields.isEmpty() ) copyFields = null;
```
##########
File path: solr/core/src/java/org/apache/solr/update/DocumentBuilder.java
##########
@@ -258,6 +246,48 @@ public static Document toDocument(SolrInputDocument doc,
IndexSchema schema, boo
return out;
}
+ private static boolean addOriginalField( Object originalFieldValue,
SchemaField sfield, boolean forInPlaceUpdate, Document out, Set<String>
usedFields) {
+ addField(out, sfield, originalFieldValue, forInPlaceUpdate);
+ // record the field as having a value
+ usedFields.add(sfield.getName());
+ return true;
+ }
+
+ private static boolean addCopyFields(Object originalFieldValue, FieldType
originalFieldType, List<CopyField> copyFields, boolean forInPlaceUpdate, String
uniqueKeyFieldName, Document out, Set<String> usedFields) {
+ boolean used = false;
+ for (CopyField cf : copyFields) {
+ SchemaField destinationField = cf.getDestination();
+
+ final boolean destHasValues =
usedFields.contains(destinationField.getName());
+
+ // Dense Vector Fields can only be copied to same field type
+ if (originalFieldType instanceof DenseVectorField &&
!(destinationField.getType() instanceof DenseVectorField)) {
+ throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
+ "The copy field destination must be a DenseVectorField: " +
+ destinationField.getName());
+ }
Review comment:
Is there also a requirement for dimension alignment i.e. `dst` must not
be higher dimension than `src` and/or they should be equal? Maybe that's
already covered elsewhere, I haven't looked.
##########
File path:
solr/core/src/test-files/solr/collection1/conf/bad-schema-densevector-docvalues.xml
##########
@@ -0,0 +1,31 @@
+<?xml version="1.0" ?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<!-- Test schema file for DenseVectorField -->
+
+<schema name="bad-schema-densevector-similarity-null" version="1.0">
Review comment:
possibly unintended name attribute vs. file name mismatch
```suggestion
<schema name="bad-schema-densevector-docvalues" version="1.0">
```
##########
File path: solr/core/src/java/org/apache/solr/search/neural/KnnQParser.java
##########
@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.search.neural;
+
+import org.apache.commons.lang3.StringUtils;
+import org.apache.lucene.search.Query;
+import org.apache.solr.common.SolrException;
+import org.apache.solr.common.params.SolrParams;
+import org.apache.solr.request.SolrQueryRequest;
+import org.apache.solr.schema.DenseVectorField;
+import org.apache.solr.schema.FieldType;
+import org.apache.solr.schema.SchemaField;
+import org.apache.solr.search.QParser;
+import org.apache.solr.search.QueryParsing;
+
+public class KnnQParser extends QParser {
+
+ static final String TOP_K = "topK";// retrieve the top K results based on
the distance similarity function
+ static final int DEFAULT_TOP_K = 10;
+
+ /**
+ * Constructor for the QParser
+ *
+ * @param qstr The part of the query string specific to this parser
+ * @param localParams The set of parameters that are specific to this
QParser. See https://solr.apache.org/guide/local-parameters-in-queries.html
+ * @param params The rest of the {@link SolrParams}
+ * @param req The original {@link SolrQueryRequest}.
+ */
+ public KnnQParser(String qstr, SolrParams localParams, SolrParams params,
SolrQueryRequest req) {
+ super(qstr, localParams, params, req);
+ }
+
+ @Override
+ public Query parse() {
+ String denseVectorField = localParams.get(QueryParsing.F);
+ String vectorToSearch = localParams.get(QueryParsing.V);
+ int topK = localParams.getInt(TOP_K, DEFAULT_TOP_K);
+
+ if (denseVectorField == null || denseVectorField.isEmpty()) {
+ throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "the
Dense Vector field 'f' is missing");
+ }
+
+ if (vectorToSearch == null || vectorToSearch.isEmpty()) {
+ throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "the
Dense Vector to search is missing");
+ }
+
+ SchemaField schemaField =
req.getCore().getLatestSchema().getField(denseVectorField);
+ FieldType fieldType = schemaField.getType();
+ if (!(fieldType instanceof DenseVectorField)) {
+ throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "only
DenseVectorField is compatible with Knn Query Parser");
+ }
+
+ DenseVectorField denseVectorType = (DenseVectorField) fieldType;
+ float[] parsedVectorToSearch = parseVector(vectorToSearch,
denseVectorType.getDimension());
+ return denseVectorType.getKnnVectorQuery(schemaField,
parsedVectorToSearch, topK);
+ }
+
+ /**
+ * Parses a String vector.
+ *
+ * @param value with format: [f1, f2, f3, f4...fn]
+ * @return a float array
+ */
+ private float[] parseVector(String value, int dimension) {
Review comment:
```suggestion
private static float[] parseVector(String value, int dimension) {
```
##########
File path: solr/core/src/java/org/apache/solr/schema/DenseVectorField.java
##########
@@ -0,0 +1,256 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.schema;
+
+import org.apache.lucene.codecs.lucene90.Lucene90HnswVectorsFormat;
+import org.apache.lucene.document.KnnVectorField;
+import org.apache.lucene.index.IndexableField;
+import org.apache.lucene.index.VectorSimilarityFunction;
+import org.apache.lucene.queries.function.ValueSource;
+import org.apache.lucene.search.KnnVectorQuery;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.SortField;
+import org.apache.lucene.util.hnsw.HnswGraph;
+import org.apache.solr.common.SolrException;
+import org.apache.solr.search.QParser;
+import org.apache.solr.uninverting.UninvertingReader;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+
+import static java.util.Optional.ofNullable;
+import static
org.apache.lucene.codecs.lucene90.Lucene90HnswVectorsFormat.DEFAULT_BEAM_WIDTH;
+import static
org.apache.lucene.codecs.lucene90.Lucene90HnswVectorsFormat.DEFAULT_MAX_CONN;
+
+/**
+ * Provides a field type to support Lucene's {@link
+ * org.apache.lucene.document.KnnVectorField}.
+ * See {@link org.apache.lucene.search.KnnVectorQuery} for more details.
+ * It supports a fixed cardinality dimension for the vector and a fixed
similarity function.
+ * The default similarity is EUCLIDEAN (L2).
+ * The default index codec format is specified in the Lucene Codec constructor.
+ * For Lucene 9.0 e.g.
+ * See {@link org.apache.lucene.codecs.lucene90.Lucene90Codec}
+ * Currently only {@link
org.apache.lucene.codecs.lucene90.Lucene90HnswVectorsFormat} is supported for
+ * advanced hyper-parameter customisation.
+ * See {@link org.apache.lucene.util.hnsw.HnswGraph} for more details about
the implementation.
+ *
+ * <br>
+ * Only {@code Indexed} and {@code Stored} attributes are supported.
+ */
+public class DenseVectorField extends FloatPointField {
+
+ static final String KNN_VECTOR_DIMENSION = "vectorDimension";
+ static final String KNN_SIMILARITY_FUNCTION = "similarityFunction";
+
+ static final String CODEC_FORMAT = "codecFormat";
+ static final String HNSW_MAX_CONNECTIONS = "hnswMaxConnections";
+ static final String HNSW_BEAM_WIDTH = "hnswBeamWidth";
+
+ int dimension;
+ VectorSimilarityFunction similarityFunction;
+ VectorSimilarityFunction DEFAULT_SIMILARITY =
VectorSimilarityFunction.EUCLIDEAN;
+
+ String codecFormat;
+ /**
+ * This parameter is coupled with the {@link Lucene90HnswVectorsFormat}
format implementation.
+ * Controls how many of the nearest neighbor candidates are connected to
the new node. Defaults to
+ * {@link Lucene90HnswVectorsFormat#DEFAULT_MAX_CONN}. See {@link
HnswGraph} for more details.
+ */
+ int hnswMaxConn;
+ /**
+ * This parameter is coupled with the {@link Lucene90HnswVectorsFormat}
format implementation.
+ * The number of candidate neighbors to track while searching the graph
for each newly inserted
+ * node. Defaults to {@link
Lucene90HnswVectorsFormat#DEFAULT_BEAM_WIDTH}. See {@link
+ * HnswGraph} for details.
+ */
+ int hnswBeamWidth;
+
+ @Override
+ public void init(IndexSchema schema, Map<String, String> args) {
+ this.dimension = ofNullable(args.get(KNN_VECTOR_DIMENSION))
+ .map(value -> Integer.parseInt(value))
+ .orElseThrow(() -> new
SolrException(SolrException.ErrorCode.SERVER_ERROR, "the vector dimension is a
mandatory parameter"));
+ args.remove(KNN_VECTOR_DIMENSION);
+
+ this.similarityFunction = ofNullable(args.get(KNN_SIMILARITY_FUNCTION))
+ .map(value ->
VectorSimilarityFunction.valueOf(value.toUpperCase(Locale.ROOT)))
+ .orElse(DEFAULT_SIMILARITY);
+ args.remove(KNN_SIMILARITY_FUNCTION);
+
+ this.codecFormat = args.get(CODEC_FORMAT);
+ args.remove(CODEC_FORMAT);
+
+ this.hnswMaxConn = ofNullable(args.get(HNSW_MAX_CONNECTIONS))
+ .map(value -> Integer.parseInt(value))
+ .orElse(DEFAULT_MAX_CONN);
+ args.remove(HNSW_MAX_CONNECTIONS);
+
+ this.hnswBeamWidth = ofNullable(args.get(HNSW_BEAM_WIDTH))
+ .map(value -> Integer.parseInt(value))
+ .orElse(DEFAULT_BEAM_WIDTH);
+ args.remove(HNSW_BEAM_WIDTH);
+
+ this.properties &= ~MULTIVALUED;
+ this.properties &= ~UNINVERTIBLE;
+
+ super.init(schema, args);
+ }
+
+ public int getDimension() {
+ return dimension;
+ }
+
+ public String getCodecFormat() {
+ return codecFormat;
+ }
+
+ public Integer getHnswMaxConn() {
+ return hnswMaxConn;
+ }
+
+ public Integer getHnswBeamWidth() {
+ return hnswBeamWidth;
+ }
+
+ @Override
+ public void checkSchemaField(final SchemaField field) throws SolrException
{
+ super.checkSchemaField(field);
+ if (field.multiValued()) {
+ throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
+ getClass().getSimpleName() + " fields can not be
multiValued: " + field.getName());
+ }
+
+ if (field.hasDocValues()) {
+ throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
+ getClass().getSimpleName() + " fields can not have
docValues: " + field.getName());
+ }
+ }
+
+ public List<IndexableField> createFields(SchemaField field, Object value) {
+ List<IndexableField> fields = new ArrayList<>();
+ float[] parsedVector;
+ try {
+ parsedVector = parseVector(value);
+ } catch (RuntimeException e) {
+ throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
"Error while creating field '" + field + "' from value '" + value + "',
expected format:'[f1, f2, f3...fn]' e.g. [1.0, 3.4, 5.6]", e);
+ }
+
+ if (field.indexed()) {
+ fields.add(createField(field, parsedVector));
+ }
+ if (field.stored()) {
+ for (float vectorElement : parsedVector) {
+ fields.add(getStoredField(field, vectorElement));
+ }
+ }
+ return fields;
+ }
+
+ @Override
+ public IndexableField createField(SchemaField field, Object parsedVector) {
+ float[] typedVector;
+ if (parsedVector == null) return null;
+ typedVector = (float[]) parsedVector;
Review comment:
```suggestion
if (parsedVector == null) return null;
float[] typedVector = (float[]) parsedVector;
```
##########
File path: solr/core/src/java/org/apache/solr/search/neural/KnnQParser.java
##########
@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.search.neural;
+
+import org.apache.commons.lang3.StringUtils;
+import org.apache.lucene.search.Query;
+import org.apache.solr.common.SolrException;
+import org.apache.solr.common.params.SolrParams;
+import org.apache.solr.request.SolrQueryRequest;
+import org.apache.solr.schema.DenseVectorField;
+import org.apache.solr.schema.FieldType;
+import org.apache.solr.schema.SchemaField;
+import org.apache.solr.search.QParser;
+import org.apache.solr.search.QueryParsing;
+
+public class KnnQParser extends QParser {
+
+ static final String TOP_K = "topK";// retrieve the top K results based on
the distance similarity function
+ static final int DEFAULT_TOP_K = 10;
+
+ /**
+ * Constructor for the QParser
+ *
+ * @param qstr The part of the query string specific to this parser
+ * @param localParams The set of parameters that are specific to this
QParser. See https://solr.apache.org/guide/local-parameters-in-queries.html
+ * @param params The rest of the {@link SolrParams}
+ * @param req The original {@link SolrQueryRequest}.
+ */
+ public KnnQParser(String qstr, SolrParams localParams, SolrParams params,
SolrQueryRequest req) {
+ super(qstr, localParams, params, req);
+ }
+
+ @Override
+ public Query parse() {
+ String denseVectorField = localParams.get(QueryParsing.F);
+ String vectorToSearch = localParams.get(QueryParsing.V);
+ int topK = localParams.getInt(TOP_K, DEFAULT_TOP_K);
+
+ if (denseVectorField == null || denseVectorField.isEmpty()) {
+ throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "the
Dense Vector field 'f' is missing");
+ }
+
+ if (vectorToSearch == null || vectorToSearch.isEmpty()) {
+ throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "the
Dense Vector to search is missing");
Review comment:
similar to `field 'f'` above could do `value 'v'` here too
```suggestion
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
"the Dense Vector value 'v' to search is missing");
```
##########
File path: solr/core/src/java/org/apache/solr/schema/DenseVectorField.java
##########
@@ -0,0 +1,256 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.schema;
+
+import org.apache.lucene.codecs.lucene90.Lucene90HnswVectorsFormat;
+import org.apache.lucene.document.KnnVectorField;
+import org.apache.lucene.index.IndexableField;
+import org.apache.lucene.index.VectorSimilarityFunction;
+import org.apache.lucene.queries.function.ValueSource;
+import org.apache.lucene.search.KnnVectorQuery;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.SortField;
+import org.apache.lucene.util.hnsw.HnswGraph;
+import org.apache.solr.common.SolrException;
+import org.apache.solr.search.QParser;
+import org.apache.solr.uninverting.UninvertingReader;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+
+import static java.util.Optional.ofNullable;
+import static
org.apache.lucene.codecs.lucene90.Lucene90HnswVectorsFormat.DEFAULT_BEAM_WIDTH;
+import static
org.apache.lucene.codecs.lucene90.Lucene90HnswVectorsFormat.DEFAULT_MAX_CONN;
+
+/**
+ * Provides a field type to support Lucene's {@link
+ * org.apache.lucene.document.KnnVectorField}.
+ * See {@link org.apache.lucene.search.KnnVectorQuery} for more details.
+ * It supports a fixed cardinality dimension for the vector and a fixed
similarity function.
+ * The default similarity is EUCLIDEAN (L2).
+ * The default index codec format is specified in the Lucene Codec constructor.
+ * For Lucene 9.0 e.g.
+ * See {@link org.apache.lucene.codecs.lucene90.Lucene90Codec}
+ * Currently only {@link
org.apache.lucene.codecs.lucene90.Lucene90HnswVectorsFormat} is supported for
+ * advanced hyper-parameter customisation.
+ * See {@link org.apache.lucene.util.hnsw.HnswGraph} for more details about
the implementation.
+ *
+ * <br>
+ * Only {@code Indexed} and {@code Stored} attributes are supported.
+ */
+public class DenseVectorField extends FloatPointField {
+
+ static final String KNN_VECTOR_DIMENSION = "vectorDimension";
+ static final String KNN_SIMILARITY_FUNCTION = "similarityFunction";
+
+ static final String CODEC_FORMAT = "codecFormat";
+ static final String HNSW_MAX_CONNECTIONS = "hnswMaxConnections";
+ static final String HNSW_BEAM_WIDTH = "hnswBeamWidth";
+
+ int dimension;
+ VectorSimilarityFunction similarityFunction;
+ VectorSimilarityFunction DEFAULT_SIMILARITY =
VectorSimilarityFunction.EUCLIDEAN;
+
+ String codecFormat;
+ /**
+ * This parameter is coupled with the {@link Lucene90HnswVectorsFormat}
format implementation.
+ * Controls how many of the nearest neighbor candidates are connected to
the new node. Defaults to
+ * {@link Lucene90HnswVectorsFormat#DEFAULT_MAX_CONN}. See {@link
HnswGraph} for more details.
+ */
+ int hnswMaxConn;
+ /**
+ * This parameter is coupled with the {@link Lucene90HnswVectorsFormat}
format implementation.
+ * The number of candidate neighbors to track while searching the graph
for each newly inserted
+ * node. Defaults to {@link
Lucene90HnswVectorsFormat#DEFAULT_BEAM_WIDTH}. See {@link
+ * HnswGraph} for details.
+ */
+ int hnswBeamWidth;
+
+ @Override
+ public void init(IndexSchema schema, Map<String, String> args) {
+ this.dimension = ofNullable(args.get(KNN_VECTOR_DIMENSION))
+ .map(value -> Integer.parseInt(value))
+ .orElseThrow(() -> new
SolrException(SolrException.ErrorCode.SERVER_ERROR, "the vector dimension is a
mandatory parameter"));
+ args.remove(KNN_VECTOR_DIMENSION);
+
+ this.similarityFunction = ofNullable(args.get(KNN_SIMILARITY_FUNCTION))
+ .map(value ->
VectorSimilarityFunction.valueOf(value.toUpperCase(Locale.ROOT)))
+ .orElse(DEFAULT_SIMILARITY);
+ args.remove(KNN_SIMILARITY_FUNCTION);
+
+ this.codecFormat = args.get(CODEC_FORMAT);
+ args.remove(CODEC_FORMAT);
+
+ this.hnswMaxConn = ofNullable(args.get(HNSW_MAX_CONNECTIONS))
+ .map(value -> Integer.parseInt(value))
+ .orElse(DEFAULT_MAX_CONN);
+ args.remove(HNSW_MAX_CONNECTIONS);
+
+ this.hnswBeamWidth = ofNullable(args.get(HNSW_BEAM_WIDTH))
+ .map(value -> Integer.parseInt(value))
+ .orElse(DEFAULT_BEAM_WIDTH);
+ args.remove(HNSW_BEAM_WIDTH);
+
+ this.properties &= ~MULTIVALUED;
+ this.properties &= ~UNINVERTIBLE;
+
+ super.init(schema, args);
+ }
+
+ public int getDimension() {
+ return dimension;
+ }
+
+ public String getCodecFormat() {
+ return codecFormat;
+ }
+
+ public Integer getHnswMaxConn() {
+ return hnswMaxConn;
+ }
+
+ public Integer getHnswBeamWidth() {
+ return hnswBeamWidth;
+ }
+
+ @Override
+ public void checkSchemaField(final SchemaField field) throws SolrException
{
+ super.checkSchemaField(field);
+ if (field.multiValued()) {
+ throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
+ getClass().getSimpleName() + " fields can not be
multiValued: " + field.getName());
+ }
+
+ if (field.hasDocValues()) {
+ throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
+ getClass().getSimpleName() + " fields can not have
docValues: " + field.getName());
+ }
+ }
+
+ public List<IndexableField> createFields(SchemaField field, Object value) {
+ List<IndexableField> fields = new ArrayList<>();
+ float[] parsedVector;
+ try {
+ parsedVector = parseVector(value);
+ } catch (RuntimeException e) {
+ throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
"Error while creating field '" + field + "' from value '" + value + "',
expected format:'[f1, f2, f3...fn]' e.g. [1.0, 3.4, 5.6]", e);
+ }
+
Review comment:
if (and that is an if) the `parsedVector` typically is longer than the
default initial `ArrayList` capacity and if `field.stored()` typically is true
then deferring the allocation to here could avoid `fields` resizing as part of
the `for` loop below e.g.
```suggestion
List<IndexableField> fields = new ArrayList<>(parsedVector.length +
1);
```
##########
File path: solr/core/src/java/org/apache/solr/search/neural/KnnQParser.java
##########
@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.search.neural;
+
+import org.apache.commons.lang3.StringUtils;
+import org.apache.lucene.search.Query;
+import org.apache.solr.common.SolrException;
+import org.apache.solr.common.params.SolrParams;
+import org.apache.solr.request.SolrQueryRequest;
+import org.apache.solr.schema.DenseVectorField;
+import org.apache.solr.schema.FieldType;
+import org.apache.solr.schema.SchemaField;
+import org.apache.solr.search.QParser;
+import org.apache.solr.search.QueryParsing;
+
+public class KnnQParser extends QParser {
+
+ static final String TOP_K = "topK";// retrieve the top K results based on
the distance similarity function
+ static final int DEFAULT_TOP_K = 10;
+
+ /**
+ * Constructor for the QParser
+ *
+ * @param qstr The part of the query string specific to this parser
+ * @param localParams The set of parameters that are specific to this
QParser. See https://solr.apache.org/guide/local-parameters-in-queries.html
+ * @param params The rest of the {@link SolrParams}
+ * @param req The original {@link SolrQueryRequest}.
+ */
+ public KnnQParser(String qstr, SolrParams localParams, SolrParams params,
SolrQueryRequest req) {
+ super(qstr, localParams, params, req);
+ }
+
+ @Override
+ public Query parse() {
+ String denseVectorField = localParams.get(QueryParsing.F);
+ String vectorToSearch = localParams.get(QueryParsing.V);
+ int topK = localParams.getInt(TOP_K, DEFAULT_TOP_K);
+
+ if (denseVectorField == null || denseVectorField.isEmpty()) {
+ throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "the
Dense Vector field 'f' is missing");
+ }
+
+ if (vectorToSearch == null || vectorToSearch.isEmpty()) {
+ throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "the
Dense Vector to search is missing");
+ }
+
+ SchemaField schemaField =
req.getCore().getLatestSchema().getField(denseVectorField);
+ FieldType fieldType = schemaField.getType();
+ if (!(fieldType instanceof DenseVectorField)) {
+ throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "only
DenseVectorField is compatible with Knn Query Parser");
+ }
+
+ DenseVectorField denseVectorType = (DenseVectorField) fieldType;
+ float[] parsedVectorToSearch = parseVector(vectorToSearch,
denseVectorType.getDimension());
+ return denseVectorType.getKnnVectorQuery(schemaField,
parsedVectorToSearch, topK);
Review comment:
Wondering if altering the `getKnnVectorQuery` signature might be
possible:
```
- public Query getKnnVectorQuery(SchemaField field, float[] vectorToSearch,
int topK)
+ public Query getKnnVectorQuery(String fieldName, float[] vectorToSearch,
int topK)
```
since only the field name is actually used (at present at least) and here
`denseVectorType` being obtained from `schemaField` but then all of
`schemaField` being passed to a `denseVectorType` method seems counter-intuitive
at first glance.
##########
File path: solr/core/src/java/org/apache/solr/schema/DenseVectorField.java
##########
@@ -0,0 +1,256 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.schema;
+
+import org.apache.lucene.codecs.lucene90.Lucene90HnswVectorsFormat;
+import org.apache.lucene.document.KnnVectorField;
+import org.apache.lucene.index.IndexableField;
+import org.apache.lucene.index.VectorSimilarityFunction;
+import org.apache.lucene.queries.function.ValueSource;
+import org.apache.lucene.search.KnnVectorQuery;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.SortField;
+import org.apache.lucene.util.hnsw.HnswGraph;
+import org.apache.solr.common.SolrException;
+import org.apache.solr.search.QParser;
+import org.apache.solr.uninverting.UninvertingReader;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+
+import static java.util.Optional.ofNullable;
+import static
org.apache.lucene.codecs.lucene90.Lucene90HnswVectorsFormat.DEFAULT_BEAM_WIDTH;
+import static
org.apache.lucene.codecs.lucene90.Lucene90HnswVectorsFormat.DEFAULT_MAX_CONN;
+
+/**
+ * Provides a field type to support Lucene's {@link
+ * org.apache.lucene.document.KnnVectorField}.
+ * See {@link org.apache.lucene.search.KnnVectorQuery} for more details.
+ * It supports a fixed cardinality dimension for the vector and a fixed
similarity function.
+ * The default similarity is EUCLIDEAN_HNSW (L2).
+ * The default index codec format is specified in the Lucene Codec constructor.
+ * For Lucene 9.0 e.g.
+ * See {@link org.apache.lucene.codecs.lucene90.Lucene90Codec}
+ * Currently only {@link
org.apache.lucene.codecs.lucene90.Lucene90HnswVectorsFormat} is supported for
+ * advanced hyper-parameter customisation.
+ * See {@link org.apache.lucene.util.hnsw.HnswGraph} for more details about
the implementation.
+ *
+ * <br>
+ * Only {@code Indexed} and {@code Stored} attributes are supported.
+ */
+public class DenseVectorField extends FloatPointField {
+
+ static final String KNN_VECTOR_DIMENSION = "vectorDimension";
+ static final String KNN_SIMILARITY_FUNCTION = "similarityFunction";
+
+ static final String CODEC_FORMAT = "codecFormat";
+ static final String HNSW_MAX_CONNECTIONS = "hnswMaxConnections";
+ static final String HNSW_BEAM_WIDTH = "hnswBeamWidth";
+
+ int dimension;
+ VectorSimilarityFunction similarityFunction;
+ VectorSimilarityFunction DEFAULT_SIMILARITY =
VectorSimilarityFunction.EUCLIDEAN;
+
+ String codecFormat;
+ /**
+ * This parameter is coupled with the {@link Lucene90HnswVectorsFormat}
format implementation.
+ * Controls how many of the nearest neighbor candidates are connected to
the new node. Defaults to
+ * {@link Lucene90HnswVectorsFormat#DEFAULT_MAX_CONN}. See {@link
HnswGraph} for more details.
+ */
+ int hnswMaxConn;
+ /**
+ * This parameter is coupled with the {@link Lucene90HnswVectorsFormat}
format implementation.
+ * The number of candidate neighbors to track while searching the graph
for each newly inserted
+ * node. Defaults to to {@link
Lucene90HnswVectorsFormat#DEFAULT_BEAM_WIDTH}. See {@link
+ * HnswGraph} for details.
+ */
+ int hnswBeamWidth;
Review comment:
could these be `private` visibility since `getDimension()` etc.
accessors are provided below?
##########
File path:
solr/core/src/test-files/solr/collection1/conf/schema-densevector-similarity-null.xml
##########
@@ -0,0 +1,31 @@
+<?xml version="1.0" ?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<!-- Test schema file for DenseVectorField -->
+
+<schema name="bad-schema-densevector-similarity-null" version="1.0">
Review comment:
possibly unintended name attribute vs. file name mismatch
```suggestion
<schema name="schema-densevector-similarity-null" version="1.0">
```
##########
File path: solr/core/src/test/org/apache/solr/search/QueryEqualityTest.java
##########
@@ -1149,6 +1150,22 @@ public void testQueryMLT() throws Exception {
}
}
+ public void testQueryKNN() throws Exception {
+ SolrInputDocument doc = new SolrInputDocument();
+ doc.addField("id", "0");
+ doc.addField("vector", Arrays.asList(1, 2, 3, 4));
+ assertU(adoc(doc));
+ assertU(commit());
+
+ try {
+ assertQueryEquals("knn", "{!knn f=vector}[1.0,2.0,3.0,4.0]",
+ "{!knn f=vector v=[1.0,2.0,3.0,4.0]}");
+ } finally {
+ delQ("*:*");
Review comment:
Not sure if document adding is necessary for this test (I suspect not)
but if one document is added then should test cleanup also only delete that one
document?
```suggestion
delQ("id:0");
```
##########
File path: solr/core/src/test-files/solr/collection1/conf/schema.xml
##########
@@ -50,7 +50,12 @@
<fieldType name="pdouble" class="solr.DoublePointField" docValues="true"/>
<fieldType name="pfloat" class="solr.FloatPointField" docValues="true"/>
<fieldType name="pdate" class="solr.DatePointField" docValues="true"/>
-
+
+ <!-- Dense Vector Fields -->
+ <fieldType name="knn_vector" class="solr.DenseVectorField"
vectorDimension="4" similarityFunction="cosine"/>
+ <fieldType name="knn_vector2" class="solr.DenseVectorField"
vectorDimension="4" similarityFunction="dot_product"/>
Review comment:
perhaps include the dimension and/or similarity function in the type name
```suggestion
<fieldType name="knn_vector_cosine" class="solr.DenseVectorField"
vectorDimension="4" similarityFunction="cosine"/>
<fieldType name="knn_vector_dot_product" class="solr.DenseVectorField"
vectorDimension="4" similarityFunction="dot_product"/>
```
##########
File path:
solr/core/src/test-files/solr/collection1/conf/schema-densevector-codec-hyperparamer.xml
##########
@@ -0,0 +1,34 @@
+<?xml version="1.0" ?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<!-- Test schema file for DenseVectorField -->
+
+<schema name="schema-densevector" version="1.0">
Review comment:
possibly unintended name attribute vs. file name mismatch
```suggestion
<schema name="schema-densevector-codec-hyperparamer" version="1.0">
```
##########
File path:
solr/core/src/test-files/solr/collection1/conf/bad-schema-densevector-multivalued.xml
##########
@@ -0,0 +1,31 @@
+<?xml version="1.0" ?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<!-- Test schema file for DenseVectorField -->
+
+<schema name="bad-schema-densevector-similarity-null" version="1.0">
Review comment:
possibly unintended name attribute vs. file name mismatch
```suggestion
<schema name="bad-schema-densevector-multivalued" version="1.0">
```
##########
File path: solr/solr-ref-guide/src/neural-search.adoc
##########
@@ -0,0 +1,322 @@
+= Neural Search
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+Search comprises of performing four primary steps:
+
+* generate a representation of the query that specifies the information need
+* generate a representation of the document that captures the information
contained
+* match the query and the document representations from the corpus of
information
+* assign a score to each matched document in order to establish a meaningful
document ranking by relevance in the results
+
+The Apache Solr *Neural Search* module adds support for neural networks based
techniques that can improve various aspects of search.
+
+These techniques can be differentiated based on whether they affect the query
representation, the document representation, or the estimation of the relevance
score.
+
+Neural Search is an industry derivation from the academic field of
https://www.microsoft.com/en-us/research/uploads/prod/2017/06/fntir2018-neuralir-mitra.pdf[Neural
information Retrieval].
+
+== Neural Search Concepts
+
+=== Deep Learning
+
+More and more frequently, we hear about how Artificial Intelligence (AI)
permeates every aspect of our lives.
+
+When we talk about AI we are referring to a superset of techniques that enable
machines to learn and show intelligence like humans.
Review comment:
Maybe `machine-learning.adoc` could link to `neural-search.adoc` and/or
vice versa, not necessarily at this location, just generally as a way of
perhaps helping users discover more ref guide content.
##########
File path: solr/core/src/java/org/apache/solr/update/DocumentBuilder.java
##########
@@ -258,6 +246,48 @@ public static Document toDocument(SolrInputDocument doc,
IndexSchema schema, boo
return out;
}
+ private static boolean addOriginalField( Object originalFieldValue,
SchemaField sfield, boolean forInPlaceUpdate, Document out, Set<String>
usedFields) {
+ addField(out, sfield, originalFieldValue, forInPlaceUpdate);
+ // record the field as having a value
+ usedFields.add(sfield.getName());
+ return true;
+ }
+
+ private static boolean addCopyFields(Object originalFieldValue, FieldType
originalFieldType, List<CopyField> copyFields, boolean forInPlaceUpdate, String
uniqueKeyFieldName, Document out, Set<String> usedFields) {
Review comment:
minor: wrap long line(s)
##########
File path: solr/core/src/java/org/apache/solr/update/DocumentBuilder.java
##########
@@ -151,63 +153,49 @@ public static Document toDocument(SolrInputDocument doc,
IndexSchema schema, boo
String name = field.getName();
SchemaField sfield = schema.getFieldOrNull(name);
- boolean used = false;
+ List<CopyField> copyFields = schema.getCopyFieldsList(name);
+ if( copyFields.size() == 0 ) copyFields = null;
+
// Make sure it has the correct number
- if( sfield!=null && !sfield.multiValued() && field.getValueCount() > 1 )
{
+ if( sfield!=null && !(sfield.getType() instanceof DenseVectorField) &&
!sfield.multiValued() && field.getValueCount() > 1 ) {
Review comment:
subjective: add new specialised clauses at the end with short-circuit
evaluation in mind
```suggestion
if( sfield!=null && !sfield.multiValued() && field.getValueCount() > 1
&& !(sfield.getType() instanceof DenseVectorField) ) {
```
##########
File path: solr/solr-ref-guide/src/neural-search.adoc
##########
@@ -0,0 +1,322 @@
+= Neural Search
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+Search comprises of performing four primary steps:
+
+* generate a representation of the query that specifies the information need
+* generate a representation of the document that captures the information
contained
+* match the query and the document representations from the corpus of
information
+* assign a score to each matched document in order to establish a meaningful
document ranking by relevance in the results
+
+The Apache Solr *Neural Search* module adds support for neural networks based
techniques that can improve various aspects of search.
+
+These techniques can be differentiated based on whether they affect the query
representation, the document representation, or the estimation of the relevance
score.
+
+Neural Search is an industry derivation from the academic field of
https://www.microsoft.com/en-us/research/uploads/prod/2017/06/fntir2018-neuralir-mitra.pdf[Neural
information Retrieval].
+
+== Neural Search Concepts
+
+=== Deep Learning
+
+More and more frequently, we hear about how Artificial Intelligence (AI)
permeates every aspect of our lives.
+
+When we talk about AI we are referring to a superset of techniques that enable
machines to learn and show intelligence like humans.
+
+Since computing power has strongly and steadily advanced in the recent past,
AI has seen a resurgence lately and it is now used in many domains, including
software engineering and Information Retrieval (the science that regulates
Search Engines and similar systems).
+
+In particular the advent of https://en.wikipedia.org/wiki/Deep_learning[Deep
Learning] introduced the use of deep neural networks to solve complex problems
that could not be solved simply by an algorithm.
+
+Deep Learning can be used to produce a vector representation of both the query
and the documents in a corpus of information.
+
+=== Dense Vector Representation
+A Dense vector describes information as an array of elements, each of them
explicitly defined.
+
+Various Deep Learning models such as
https://en.wikipedia.org/wiki/BERT_(language_model)[BERT] are able to encode
textual information as dense vectors, to be used for Dense Retrieval strategies.
+
+For additional information you can refer to this
https://sease.io/2021/12/using-bert-to-improve-search-relevance.html[blog post].
+
+=== Dense Retrieval
+Given a dense vector `v` that models the information need, the easiest
approach for providing dense vector retrieval would be to calculate the
distance(euclidean, dot product, etc.) between `v` and each vector `d` that
represents a document in the corpus of information.
+
+This approach is quite expensive, so many approximate strategies are currently
under active research.
+
+The strategy implemented in Apache Lucene and used by Apache Solr is based on
Navigable Small-world graph.
+
+It provides efficient approximate nearest neighbor search for high dimensional
vectors.
+
+See https://doi.org/10.1016/j.is.2013.10.006[Approximate nearest neighbor
algorithm based on navigable small world graphs [2014]] and
https://arxiv.org/abs/1603.09320[this paper [2018]] for details.
+
+
+== Index Time
+This is the list of Apache Solr field types designed to support Neural Search:
+
+=== DenseVectorField
+The Dense Vector field gives the possibility of indexing and searching dense
vectors of float elements.
+
+e.g.
+
+`[1.0, 2.5, 3.7, 4.1]` (array of float elements)
+
+Here's how `DenseVectorField` should be configured in the schema:
+
+[source,xml]
+<fieldType name="knn_vector" class="solr.DenseVectorField" vectorDimension="4"
similarityFunction="cosine"/>
+<field name="vector" type="knn_vector" indexed="true" stored="true"/>
+
+`vectorDimension`::
++
+[%autowidth,frame=none]
+|===
+|Mandatory
+|===
++
+The dimension of the dense vector to pass in.
++
+Accepted values:
+Integer < = 1024.
+
+`similarityFunction`::
++
+[%autowidth,frame=none]
+|===
+|Optional |Default: `euclidean`
+|===
++
+Vector similarity function; used in search to return top K most similar
vectors to a target vector.
++
+Accepted values: `euclidean`, `dot_product` or `cosine`.
+
+* `euclidean`: https://en.wikipedia.org/wiki/Euclidean_distance[Euclidean
distance]
+* `dot_product`: https://en.wikipedia.org/wiki/Dot_product[Dot product].
*NOTE*: this similarity is intended as an optimized way to perform cosine
similarity. In order to use it, all vectors must be of unit length, including
both document and query vectors. Using dot product with vectors that are not
unit length can result in errors or poor search results..
+* `cosine`: https://en.wikipedia.org/wiki/Cosine_similarity[Cosine
similarity]. *NOTE*: the preferred way to perform cosine similarity is to
normalize all vectors to unit length, and instead use DOT_PRODUCT. You should
only use this function if you need to preserve the original vectors and cannot
normalize them in advance.
+
+*N.B.* To use the following advanced parameters that customise the codec format
+and the hyper-parameter of the HNSW algorithm make sure you set this
configuration in the solrconfig.xml:
+[source,xml]
+<config>
+<codecFactory class="solr.SchemaCodecFactory"/>
+...
+
+Here's how `DenseVectorField` can be configured with the advanced codec
hyper-parameters:
+
+[source,xml]
+<fieldType name="knn_vector" class="solr.DenseVectorField" vectorDimension="4"
similarityFunction="cosine" codecFormat="Lucene90HnswVectorsFormat"
hnswMaxConnections="10" hnswBeamWidth="40"/>
+<field name="vector" type="knn_vector" indexed="true" stored="true"/>
+
+`codecFormat`::
++
+[%autowidth,frame=none]
+|===
+|Optional |Default: `Lucene90HnswVectorsFormat`
+|===
++
+(advanced) Specifies the knn codec implementation to use
++
+
+Accepted values: `Lucene90HnswVectorsFormat`.
+
+Please note that the `codecFormat` accepted values may change in future
releases.
+
+
+
+[NOTE]
+Lucene index back-compatibility is only supported for the default codec.
+If you choose to customize the `codecFormat` in your schema, upgrading to a
future version of Solr may require you to either switch back to the default
codec and optimize your index to rewrite it into the default codec before
upgrading, or re-build your entire index from scratch after upgrading.
+
+`hnswMaxConnections`::
++
+[%autowidth,frame=none]
+|===
+|Optional |Default: 16
+|===
++
+(advanced) This parameter is specific for the `Lucene90HnswVectorsFormat`
codec format:
++
+Controls how many of the nearest neighbor candidates are connected to the new
node.
++
+See https://doi.org/10.1016/j.is.2013.10.006[Approximate nearest neighbor
algorithm based on navigable small world graphs [2014]] and
https://arxiv.org/abs/1603.09320[this paper [2018]] for details.
++
+It has the same meaning as `M` from the later paper.
++
+Accepted values:
+Integer.
+
+`hnswBeamWidth`::
++
+[%autowidth,frame=none]
+|===
+|Optional |Default: 100
+|===
++
+(advanced) This parameter is specific for the `Lucene90HnswVectorsFormat`
codec format:
++
+It is the number of nearest neighbor candidates to track while searching the
graph for each newly inserted node.
++
+See https://doi.org/10.1016/j.is.2013.10.006[Approximate nearest neighbor
algorithm based on navigable small world graphs [2014]] and
https://arxiv.org/abs/1603.09320[this paper [2018]] for details.
++
+It has the same meaning as `efConstruction` from the later paper.
++
+Accepted values:
+Integer.
+
+DenseVectorField supports the attributes: `indexed`, `stored`.
+
+*N.B.* currently multivalue is not supported
+
+Here's how a `DenseVectorField` should be indexed:
+
+[.dynamic-tabs]
+--
+[example.tab-pane#json]
+====
+[.tab-label]*JSON*
+[source,json]
+----
+[{ "id": "1",
+"vector": [1.0, 2.5, 3.7, 4.1]
+},
+{ "id": "2",
+"vector": [1.5, 5.5, 6.7, 65.1]
+}
+]
+----
+====
+
+[example.tab-pane#xml]
+====
+[.tab-label]*XML*
+[source,xml]
+----
+<add>
+<doc>
+<field name="id">1</field>
+<field name="vector">1.0</field>
+<field name="vector">2.5</field>
+<field name="vector">3.7</field>
+<field name="vector">4.1</field>
+</doc>
+<doc>
+<field name="id">2</field>
+<field name="vector">1.5</field>
+<field name="vector">5.5</field>
+<field name="vector">6.7</field>
+<field name="vector">65.1</field>
+</doc>
+</add>
+----
+====
+
+[example.tab-pane#solrj]
+====
+[.tab-label]*SolrJ*
+[source,java,indent=0]
+----
+final SolrClient client = getSolrClient();
+
+final SolrInputDocument d1 = new SolrInputDocument();
+d1.setField("id", "1");
+d1.setField("vector", Arrays.asList(1.0f, 2.5f, 3.7f, 4.1f));
+
+
+final SolrInputDocument d2 = new SolrInputDocument();
+d2.setField("id", "2");
+d2.setField("vector", Arrays.asList(1.5f, 5.5f, 6.7f, 65.1f));
+
+client.add(Arrays.asList(d1, d2));
+----
+====
+--
+
+== Query Time
+This is the list of Apache Solr query approaches designed to support Neural
Search:
+
+=== knn Query Parser
+The `knn` K-Nearest Neighbors query parser allows to find the k-nearest
documents to the target vector according to indexed dense vectors in the given
field.
+
+It takes the following parameters:
+
+`f`::
++
+[%autowidth,frame=none]
+|===
+|Mandatory
+|===
++
+The DenseVectorField to search in.
+
+`topK`::
++
+[%autowidth,frame=none]
+|===
+|Optional |Default: 10
+|===
++
+How many k-nearest results to return.
+
+Here's how to run a KNN search:
+
+[source,text]
+&q={!knn f=vector topK=10}[1.0, 2.0, 3.0, 4.0]
+
+The search results retrieved are the K-nearest to the vector in input `[1.0,
2.0, 3.0, 4.0]`, ranked by the similarityFunction configured at indexing time.
+
+==== Usage with Filter Queries
+The `knn` query parser can be used in filter queries:
+[source,text]
+&q=id:(1 2 3)&fq={!knn f=vector topK=10}[1.0, 2.0, 3.0, 4.0]
+
+The `knn` query parser can be used with filter queries:
+[source,text]
+&q={!knn f=vector topK=10}[1.0, 2.0, 3.0, 4.0]&fq=id:(1 2 3)
+
+[IMPORTANT]
+====
+When using `knn` in these scenarios make sure you have a clear understanding
of how filter queries work in Apache Solr:
+
+The Ranked List of document IDs resulting from the main query `q` is
intersected with the set of document IDs deriving from each filter query `fq`.
+
+e.g.
+
+Ranked List from `q`=`[ID1, ID4, ID2, ID10]` <intersects> Set from `fq`=`{ID3,
ID2, ID9, ID4}` = `[ID4,ID2]`
+====
+
+
+==== Usage as Re-Ranking Query
+The `knn` query parser can be used to rerank first pass query results:
+[source,text]
+&q=id:(3 4 9 2)&rq={!rerank reRankQuery=$rqq reRankDocs=4
reRankWeight=1}&rqq={!knn f=vector topK=10}[1.0, 2.0, 3.0, 4.0]
+
+[IMPORTANT]
+====
+When using `knn` in reranking pay attention to the `topK` parameter.
+
+The second pass score(deriving from knn) is calculated only if the document
`d` from the first pass is within
+the K-nearest neighbors(*in the whole index*) of the target vector to search.
+
+This means the second pass `knn` is executed on the whole index anyway, which
is a current limitation.
+
+The final ranked list of results will have the first pass score(main query
`q`) added to the second pass score(the approximated similarityFunction
distance to the target vector to search) multiplied by a multiplicative
factor(reRankWeight).
+
+Details about using the ReRank Query Parser can be found in the
<<query-re-ranking.adoc#,Query Re-Ranking>> section.
Review comment:
`query-re-ranking.adoc` could list `knn` also and link to
`neural-search.adoc` here.
##########
File path: solr/core/src/java/org/apache/solr/schema/DenseVectorField.java
##########
@@ -0,0 +1,256 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.schema;
+
+import org.apache.lucene.codecs.lucene90.Lucene90HnswVectorsFormat;
+import org.apache.lucene.document.KnnVectorField;
+import org.apache.lucene.index.IndexableField;
+import org.apache.lucene.index.VectorSimilarityFunction;
+import org.apache.lucene.queries.function.ValueSource;
+import org.apache.lucene.search.KnnVectorQuery;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.SortField;
+import org.apache.lucene.util.hnsw.HnswGraph;
+import org.apache.solr.common.SolrException;
+import org.apache.solr.search.QParser;
+import org.apache.solr.uninverting.UninvertingReader;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+
+import static java.util.Optional.ofNullable;
+import static
org.apache.lucene.codecs.lucene90.Lucene90HnswVectorsFormat.DEFAULT_BEAM_WIDTH;
+import static
org.apache.lucene.codecs.lucene90.Lucene90HnswVectorsFormat.DEFAULT_MAX_CONN;
+
+/**
+ * Provides a field type to support Lucene's {@link
+ * org.apache.lucene.document.KnnVectorField}.
+ * See {@link org.apache.lucene.search.KnnVectorQuery} for more details.
+ * It supports a fixed cardinality dimension for the vector and a fixed
similarity function.
+ * The default similarity is EUCLIDEAN (L2).
+ * The default index codec format is specified in the Lucene Codec constructor.
+ * For Lucene 9.0, for example,
+ * see {@link org.apache.lucene.codecs.lucene90.Lucene90Codec}.
+ * Currently only {@link
org.apache.lucene.codecs.lucene90.Lucene90HnswVectorsFormat} is supported for
+ * advanced hyper-parameter customisation.
+ * See {@link org.apache.lucene.util.hnsw.HnswGraph} for more details about
the implementation.
+ *
+ * <br>
+ * Only {@code Indexed} and {@code Stored} attributes are supported.
+ */
+public class DenseVectorField extends FloatPointField {
+
+ static final String KNN_VECTOR_DIMENSION = "vectorDimension";
+ static final String KNN_SIMILARITY_FUNCTION = "similarityFunction";
+
+ static final String CODEC_FORMAT = "codecFormat";
+ static final String HNSW_MAX_CONNECTIONS = "hnswMaxConnections";
+ static final String HNSW_BEAM_WIDTH = "hnswBeamWidth";
+
+ int dimension;
+ VectorSimilarityFunction similarityFunction;
+ VectorSimilarityFunction DEFAULT_SIMILARITY =
VectorSimilarityFunction.EUCLIDEAN;
+
+ String codecFormat;
+ /**
+ * This parameter is coupled with the {@link Lucene90HnswVectorsFormat}
format implementation.
+ * Controls how many of the nearest neighbor candidates are connected to
the new node. Defaults to
+ * {@link Lucene90HnswVectorsFormat#DEFAULT_MAX_CONN}. See {@link
HnswGraph} for more details.
+ */
+ int hnswMaxConn;
+ /**
+ * This parameter is coupled with the {@link Lucene90HnswVectorsFormat}
format implementation.
+ * The number of candidate neighbors to track while searching the graph
for each newly inserted
+ * node. Defaults to {@link
Lucene90HnswVectorsFormat#DEFAULT_BEAM_WIDTH}. See {@link
+ * HnswGraph} for details.
+ */
+ int hnswBeamWidth;
+
+ @Override
+ public void init(IndexSchema schema, Map<String, String> args) {
+ this.dimension = ofNullable(args.get(KNN_VECTOR_DIMENSION))
+ .map(value -> Integer.parseInt(value))
+ .orElseThrow(() -> new
SolrException(SolrException.ErrorCode.SERVER_ERROR, "the vector dimension is a
mandatory parameter"));
+ args.remove(KNN_VECTOR_DIMENSION);
+
+ this.similarityFunction = ofNullable(args.get(KNN_SIMILARITY_FUNCTION))
+ .map(value ->
VectorSimilarityFunction.valueOf(value.toUpperCase(Locale.ROOT)))
+ .orElse(DEFAULT_SIMILARITY);
+ args.remove(KNN_SIMILARITY_FUNCTION);
+
+ this.codecFormat = args.get(CODEC_FORMAT);
+ args.remove(CODEC_FORMAT);
+
+ this.hnswMaxConn = ofNullable(args.get(HNSW_MAX_CONNECTIONS))
+ .map(value -> Integer.parseInt(value))
+ .orElse(DEFAULT_MAX_CONN);
+ args.remove(HNSW_MAX_CONNECTIONS);
+
+ this.hnswBeamWidth = ofNullable(args.get(HNSW_BEAM_WIDTH))
+ .map(value -> Integer.parseInt(value))
+ .orElse(DEFAULT_BEAM_WIDTH);
+ args.remove(HNSW_BEAM_WIDTH);
+
+ this.properties &= ~MULTIVALUED;
+ this.properties &= ~UNINVERTIBLE;
+
+ super.init(schema, args);
+ }
+
+ public int getDimension() {
+ return dimension;
+ }
+
+ public String getCodecFormat() {
+ return codecFormat;
+ }
+
+ public Integer getHnswMaxConn() {
+ return hnswMaxConn;
+ }
+
+ public Integer getHnswBeamWidth() {
+ return hnswBeamWidth;
+ }
+
+ @Override
+ public void checkSchemaField(final SchemaField field) throws SolrException
{
+ super.checkSchemaField(field);
+ if (field.multiValued()) {
+ throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
+ getClass().getSimpleName() + " fields can not be
multiValued: " + field.getName());
+ }
+
+ if (field.hasDocValues()) {
+ throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
+ getClass().getSimpleName() + " fields can not have
docValues: " + field.getName());
+ }
+ }
+
+ public List<IndexableField> createFields(SchemaField field, Object value) {
+ List<IndexableField> fields = new ArrayList<>();
+ float[] parsedVector;
+ try {
+ parsedVector = parseVector(value);
+ } catch (RuntimeException e) {
+ throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
"Error while creating field '" + field + "' from value '" + value + "',
expected format:'[f1, f2, f3...fn]' e.g. [1.0, 3.4, 5.6]", e);
+ }
+
+ if (field.indexed()) {
+ fields.add(createField(field, parsedVector));
+ }
+ if (field.stored()) {
+ for (float vectorElement : parsedVector) {
+ fields.add(getStoredField(field, vectorElement));
+ }
+ }
+ return fields;
+ }
+
+ @Override
+ public IndexableField createField(SchemaField field, Object parsedVector) {
+ float[] typedVector;
+ if (parsedVector == null) return null;
+ typedVector = (float[]) parsedVector;
+ return new KnnVectorField(field.getName(), typedVector,
similarityFunction);
+ }
+
+ /**
+ * Index Time Parsing
+ * The inputValue is an ArrayList with a type that depends on the loader
used:
+ * - {@link org.apache.solr.handler.loader.XMLLoader}, {@link
org.apache.solr.handler.loader.CSVLoader} produces an ArrayList of String
+ * - {@link org.apache.solr.handler.loader.JsonLoader} produces an
ArrayList of Double
+ * - {@link org.apache.solr.handler.loader.JavabinLoader} produces an
ArrayList of Float
+ *
+ * @param inputValue - An {@link ArrayList} containing the elements of the
vector
+ * @return the vector parsed
+ */
+ float[] parseVector(Object inputValue) {
+ if (!(inputValue instanceof List)) {
+ throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
"incorrect vector format." +
+ " The expected format is an array :'[f1,f2..f3]' where
each element f is a float");
+ }
+ List<?> inputVector = (List<?>) inputValue;
+ if (inputVector.size() != dimension) {
+ throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
"incorrect vector dimension." +
+ " The vector value has size "
+ + inputVector.size() + " while it is expected a vector
with size " + dimension);
+ }
+
+ float[] vector = new float[dimension];
+ if (inputVector.get(0) instanceof CharSequence) {
+ for (int i = 0; i < dimension; i++) {
+ try {
+ vector[i] =
Float.parseFloat(inputVector.get(i).toString());
+ } catch (NumberFormatException e) {
+ throw new
SolrException(SolrException.ErrorCode.BAD_REQUEST, "incorrect vector element:
'" + inputVector.get(i) +
+ "'. The expected format is:'[f1,f2..f3]' where
each element f is a float");
+ }
+ }
+ } else if (inputVector.get(0) instanceof Number) {
+ for (int i = 0; i < dimension; i++) {
+ vector[i] = ((Number) inputVector.get(i)).floatValue();
+ }
+ }
+
+ return vector;
+ }
+
+ @Override
+ public UninvertingReader.Type getUninversionType(SchemaField sf) {
+ return null;
+ }
+
+ @Override
+ public ValueSource getValueSource(SchemaField field, QParser parser) {
+ throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
+ "Function queries are not supported for Dense Vector fields.");
+ }
+
+ public Query getKnnVectorQuery(SchemaField field, float[] vectorToSearch,
int topK) {
+ return new KnnVectorQuery(field.getName(), vectorToSearch, topK);
+ }
+
+ /**
+ * Not Supported
Review comment:
It's nice to have the `Please use the {!knn} query parser ...` signpost
in the exception that is thrown. Perhaps the javadocs here could also reference
or link it.
##########
File path: solr/core/src/java/org/apache/solr/update/DocumentBuilder.java
##########
@@ -151,63 +153,49 @@ public static Document toDocument(SolrInputDocument doc,
IndexSchema schema, boo
String name = field.getName();
SchemaField sfield = schema.getFieldOrNull(name);
- boolean used = false;
+ List<CopyField> copyFields = schema.getCopyFieldsList(name);
+ if( copyFields.size() == 0 ) copyFields = null;
+
// Make sure it has the correct number
- if( sfield!=null && !sfield.multiValued() && field.getValueCount() > 1 )
{
+ if( sfield!=null && !(sfield.getType() instanceof DenseVectorField) &&
!sfield.multiValued() && field.getValueCount() > 1 ) {
throw new SolrException( SolrException.ErrorCode.BAD_REQUEST,
- "ERROR: "+getID(doc, schema)+"multiple values encountered for non
multiValued field " +
- sfield.getName() + ": " +field.getValue() );
+ "ERROR: "+getID(doc, schema)+"multiple values encountered for
non multiValued field " +
+ sfield.getName() + ": " +field.getValue() );
}
- List<CopyField> copyFields = schema.getCopyFieldsList(name);
- if( copyFields.size() == 0 ) copyFields = null;
-
Review comment:
could `copyFields` remain here? it moved 'up' but at a glance appears to
not be needed any earlier?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]