Tim-Brooks commented on code in PR #15990: URL: https://github.com/apache/lucene/pull/15990#discussion_r3185890623
########## lucene/core/src/test/org/apache/lucene/index/TestColumnBatchIndexing.java: ########## @@ -0,0 +1,2752 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.index; + +import java.io.IOException; +import java.util.List; +import org.apache.lucene.document.BinaryDocValuesField; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.DoublePoint; +import org.apache.lucene.document.FieldType; +import org.apache.lucene.document.FloatPoint; +import org.apache.lucene.document.IntPoint; +import org.apache.lucene.document.LongPoint; +import org.apache.lucene.document.NumericDocValuesField; +import org.apache.lucene.document.SortedDocValuesField; +import org.apache.lucene.document.SortedNumericDocValuesField; +import org.apache.lucene.document.SortedSetDocValuesField; +import org.apache.lucene.document.StoredValue; +import org.apache.lucene.document.StringField; +import org.apache.lucene.document.TextField; +import org.apache.lucene.document.column.BinaryColumn; +import org.apache.lucene.document.column.BinaryTupleCursor; +import org.apache.lucene.document.column.Column; +import org.apache.lucene.document.column.ColumnBatch; +import 
org.apache.lucene.document.column.LongColumn; +import org.apache.lucene.document.column.LongTupleCursor; +import org.apache.lucene.document.column.LongValuesCursor; +import org.apache.lucene.document.column.VectorColumn; +import org.apache.lucene.document.column.VectorTupleCursor; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.store.Directory; +import org.apache.lucene.tests.analysis.MockAnalyzer; +import org.apache.lucene.tests.util.LuceneTestCase; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.NumericUtils; + +/** Tests for column-oriented batch indexing via {@link IndexWriter#addBatch}. */ +public class TestColumnBatchIndexing extends LuceneTestCase { + + public void testNumericDocValues() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + long[] values = {10, 20, 30}; + int[] docIds = {0, 1, 2}; + w.addBatch( + simpleBatch(3, new ArrayLongColumn("numeric", NumericDocValuesField.TYPE, docIds, values))); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + NumericDocValues dv = leaf.getNumericDocValues("numeric"); + for (int i = 0; i < values.length; i++) { + assertEquals(i, dv.nextDoc()); + assertEquals(values[i], dv.longValue()); + } + assertEquals(DocIdSetIterator.NO_MORE_DOCS, dv.nextDoc()); + + r.close(); + w.close(); + dir.close(); + } + + public void testSortedNumericDocValues() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + // Doc 0 has two values, doc 1 has one value + int[] docIds = {0, 0, 1}; + long[] values = {5, 15, 25}; + w.addBatch( + simpleBatch( + 2, + new ArrayLongColumn( + "sortedNumeric", SortedNumericDocValuesField.TYPE, docIds, values))); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = 
getOnlyLeafReader(r); + SortedNumericDocValues dv = leaf.getSortedNumericDocValues("sortedNumeric"); + + assertEquals(0, dv.nextDoc()); + assertEquals(2, dv.docValueCount()); + assertEquals(5, dv.nextValue()); + assertEquals(15, dv.nextValue()); + + assertEquals(1, dv.nextDoc()); + assertEquals(1, dv.docValueCount()); + assertEquals(25, dv.nextValue()); + + assertEquals(DocIdSetIterator.NO_MORE_DOCS, dv.nextDoc()); + + r.close(); + w.close(); + dir.close(); + } + + public void testBinaryDocValues() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + BytesRef[] values = {newBytesRef("aaa"), newBytesRef("bbb"), newBytesRef("ccc")}; + int[] docIds = {0, 1, 2}; + w.addBatch( + simpleBatch(3, new ArrayBinaryColumn("binary", BinaryDocValuesField.TYPE, docIds, values))); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + BinaryDocValues dv = leaf.getBinaryDocValues("binary"); + for (int i = 0; i < values.length; i++) { + assertEquals(i, dv.nextDoc()); + assertEquals(values[i], dv.binaryValue()); + } + assertEquals(DocIdSetIterator.NO_MORE_DOCS, dv.nextDoc()); + + r.close(); + w.close(); + dir.close(); + } + + public void testSortedDocValues() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + BytesRef[] values = {newBytesRef("x"), newBytesRef("y"), newBytesRef("x")}; + int[] docIds = {0, 1, 2}; + w.addBatch( + simpleBatch(3, new ArrayBinaryColumn("sorted", SortedDocValuesField.TYPE, docIds, values))); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + SortedDocValues dv = leaf.getSortedDocValues("sorted"); + + assertEquals(0, dv.nextDoc()); + assertEquals(newBytesRef("x"), dv.lookupOrd(dv.ordValue())); + assertEquals(1, dv.nextDoc()); + assertEquals(newBytesRef("y"), dv.lookupOrd(dv.ordValue())); + assertEquals(2, dv.nextDoc()); + 
assertEquals(newBytesRef("x"), dv.lookupOrd(dv.ordValue())); + + // "x" and "y" should share ord space + assertEquals(2, dv.getValueCount()); + + r.close(); + w.close(); + dir.close(); + } + + public void testSortedSetDocValues() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + // Doc 0 has two values, doc 1 has one value + int[] docIds = {0, 0, 1}; + BytesRef[] values = {newBytesRef("a"), newBytesRef("b"), newBytesRef("a")}; + w.addBatch( + simpleBatch( + 2, new ArrayBinaryColumn("sortedSet", SortedSetDocValuesField.TYPE, docIds, values))); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + SortedSetDocValues dv = leaf.getSortedSetDocValues("sortedSet"); + + assertEquals(0, dv.nextDoc()); + assertEquals(2, dv.docValueCount()); + assertEquals(newBytesRef("a"), dv.lookupOrd(dv.nextOrd())); + assertEquals(newBytesRef("b"), dv.lookupOrd(dv.nextOrd())); + + assertEquals(1, dv.nextDoc()); + assertEquals(1, dv.docValueCount()); + assertEquals(newBytesRef("a"), dv.lookupOrd(dv.nextOrd())); + + assertEquals(DocIdSetIterator.NO_MORE_DOCS, dv.nextDoc()); + + r.close(); + w.close(); + dir.close(); + } + + public void testMultipleColumns() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + int[] allDocs = {0, 1, 2}; + long[] numericValues = {100, 200, 300}; + BytesRef[] sortedValues = {newBytesRef("a"), newBytesRef("b"), newBytesRef("c")}; + + w.addBatch( + simpleBatch( + 3, + new ArrayLongColumn("numeric", NumericDocValuesField.TYPE, allDocs, numericValues), + new ArrayBinaryColumn("sorted", SortedDocValuesField.TYPE, allDocs, sortedValues))); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + + NumericDocValues ndv = leaf.getNumericDocValues("numeric"); + SortedDocValues sdv = leaf.getSortedDocValues("sorted"); + for (int i = 0; i < 3; i++) { + 
assertEquals(i, ndv.nextDoc()); + assertEquals(numericValues[i], ndv.longValue()); + assertEquals(i, sdv.nextDoc()); + assertEquals(sortedValues[i], sdv.lookupOrd(sdv.ordValue())); + } + + r.close(); + w.close(); + dir.close(); + } + + public void testSparseDocValues() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + // Only doc 1 has a value (docs 0 and 2 are missing) + int[] docIds = {1}; + long[] values = {42}; + w.addBatch( + simpleBatch(3, new ArrayLongColumn("sparse", NumericDocValuesField.TYPE, docIds, values))); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + NumericDocValues dv = leaf.getNumericDocValues("sparse"); + assertEquals(1, dv.nextDoc()); + assertEquals(42, dv.longValue()); + assertEquals(DocIdSetIterator.NO_MORE_DOCS, dv.nextDoc()); + + r.close(); + w.close(); + dir.close(); + } + + public void testParentFieldIndexed() throws IOException { + Directory dir = newDirectory(); + IndexWriterConfig config = newIndexWriterConfig(); + config.setParentField("_parent"); + IndexWriter w = new IndexWriter(dir, config); + + int[] docIds = {0, 1, 2}; + long[] values = {1, 2, 3}; + w.addBatch( + simpleBatch(3, new ArrayLongColumn("numeric", NumericDocValuesField.TYPE, docIds, values))); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + + // Every batch doc should have the parent field + NumericDocValues parentDv = leaf.getNumericDocValues("_parent"); + assertNotNull(parentDv); + for (int i = 0; i < 3; i++) { + assertEquals(i, parentDv.nextDoc()); + } + + r.close(); + w.close(); + dir.close(); + } + + public void testPointsColumn() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + // Create a points-only FieldType (1 dimension, Integer.BYTES) + FieldType pointType = new FieldType(); + pointType.setDimensions(1, Integer.BYTES); + 
pointType.freeze(); + + int[] raw = {10, 20, 30}; + int[] docIds = {0, 1, 2}; + long[] values = new long[raw.length]; + for (int i = 0; i < raw.length; i++) { + values[i] = raw[i]; + } + w.addBatch( + simpleBatch( + 3, + new ArrayLongColumn("point", pointType, LongColumn.NumericKind.INT, docIds, values))); + + DirectoryReader r = DirectoryReader.open(w); + IndexSearcher searcher = new IndexSearcher(r); + assertEquals(1, searcher.count(IntPoint.newExactQuery("point", 10))); + assertEquals(1, searcher.count(IntPoint.newExactQuery("point", 20))); + assertEquals(1, searcher.count(IntPoint.newExactQuery("point", 30))); + assertEquals(0, searcher.count(IntPoint.newExactQuery("point", 99))); + assertEquals(3, searcher.count(IntPoint.newRangeQuery("point", 10, 30))); + + r.close(); + w.close(); + dir.close(); + } + + public void testPointsWithDocValues() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + // 1D int points + SORTED_NUMERIC DV via the compat layer. 
+ FieldType pointAndDvType = new FieldType(); + pointAndDvType.setDimensions(1, Integer.BYTES); + pointAndDvType.setDocValuesType(DocValuesType.SORTED_NUMERIC); + pointAndDvType.freeze(); + + int[] raw = {10, 20, 30}; + int[] docIds = {0, 1, 2}; + long[] values = new long[raw.length]; + for (int i = 0; i < raw.length; i++) { + values[i] = raw[i]; + } + w.addBatch( + simpleBatch( + 3, + new ArrayLongColumn( + "field", pointAndDvType, LongColumn.NumericKind.INT, docIds, values))); + + DirectoryReader r = DirectoryReader.open(w); + IndexSearcher searcher = new IndexSearcher(r); + + assertEquals(1, searcher.count(IntPoint.newExactQuery("field", 10))); + assertEquals(3, searcher.count(IntPoint.newRangeQuery("field", 10, 30))); + + LeafReader leaf = getOnlyLeafReader(r); + SortedNumericDocValues dv = leaf.getSortedNumericDocValues("field"); + for (int i = 0; i < 3; i++) { + assertEquals(i, dv.nextDoc()); + assertEquals(raw[i], dv.nextValue()); + } + + r.close(); + w.close(); + dir.close(); + } + + public void testSparsePointsColumn() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + FieldType pointType = new FieldType(); + pointType.setDimensions(1, Integer.BYTES); + pointType.freeze(); + + // Only doc 1 out of 3 has a point value + int[] docIds = {1}; + long[] values = {42}; + w.addBatch( + simpleBatch( + 3, + new ArrayLongColumn("point", pointType, LongColumn.NumericKind.INT, docIds, values))); + + DirectoryReader r = DirectoryReader.open(w); + IndexSearcher searcher = new IndexSearcher(r); + assertEquals(1, searcher.count(IntPoint.newExactQuery("point", 42))); + assertEquals(0, searcher.count(IntPoint.newExactQuery("point", 0))); + + r.close(); + w.close(); + dir.close(); + } + + public void testStoredLongColumn() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + // stored + NUMERIC doc values + FieldType storedNumericType = 
new FieldType(); + storedNumericType.setStored(true); + storedNumericType.setDocValuesType(DocValuesType.NUMERIC); + storedNumericType.freeze(); + + int[] docIds = {0, 1, 2}; + long[] values = {100, 200, 300}; + w.addBatch(simpleBatch(3, new ArrayLongColumn("val", storedNumericType, docIds, values))); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + + // Verify stored fields + StoredFields storedFields = leaf.storedFields(); + for (int i = 0; i < 3; i++) { + Document doc = storedFields.document(i); + assertEquals(values[i], doc.getField("val").numericValue().longValue()); + } + + // Verify doc values + NumericDocValues dv = leaf.getNumericDocValues("val"); + for (int i = 0; i < 3; i++) { + assertEquals(i, dv.nextDoc()); + assertEquals(values[i], dv.longValue()); + } + + r.close(); + w.close(); + dir.close(); + } + + public void testStoredBinaryColumn() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + // stored + SORTED doc values + FieldType storedSortedType = new FieldType(); + storedSortedType.setStored(true); + storedSortedType.setDocValuesType(DocValuesType.SORTED); + storedSortedType.freeze(); + + int[] docIds = {0, 1, 2}; + BytesRef[] values = {newBytesRef("aaa"), newBytesRef("bbb"), newBytesRef("ccc")}; + w.addBatch(simpleBatch(3, new ArrayBinaryColumn("val", storedSortedType, docIds, values))); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + + // Verify stored fields + StoredFields storedFields = leaf.storedFields(); + for (int i = 0; i < 3; i++) { + Document doc = storedFields.document(i); + assertEquals(values[i], doc.getField("val").binaryValue()); + } + + // Verify doc values + SortedDocValues dv = leaf.getSortedDocValues("val"); + for (int i = 0; i < 3; i++) { + assertEquals(i, dv.nextDoc()); + assertEquals(values[i], dv.lookupOrd(dv.ordValue())); + } + + r.close(); + w.close(); + 
dir.close(); + } + + public void testStoredOnlyColumn() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + // stored only — no doc values, no points + FieldType storedOnlyType = new FieldType(); + storedOnlyType.setStored(true); + storedOnlyType.freeze(); + + int[] docIds = {0, 1, 2}; + long[] values = {10, 20, 30}; + w.addBatch(simpleBatch(3, new ArrayLongColumn("stored", storedOnlyType, docIds, values))); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + + StoredFields storedFields = leaf.storedFields(); + for (int i = 0; i < 3; i++) { + Document doc = storedFields.document(i); + assertEquals(values[i], doc.getField("stored").numericValue().longValue()); + } + + r.close(); + w.close(); + dir.close(); + } + + public void testMixedStoredAndNonStoredColumns() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + FieldType storedNumericType = new FieldType(); + storedNumericType.setStored(true); + storedNumericType.setDocValuesType(DocValuesType.NUMERIC); + storedNumericType.freeze(); + + int[] allDocs = {0, 1, 2}; + long[] storedValues = {100, 200, 300}; + long[] dvOnlyValues = {1, 2, 3}; + w.addBatch( + simpleBatch( + 3, + new ArrayLongColumn("stored_field", storedNumericType, allDocs, storedValues), + new ArrayLongColumn("dv_only", NumericDocValuesField.TYPE, allDocs, dvOnlyValues))); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + + // Verify stored field + StoredFields storedFields = leaf.storedFields(); + for (int i = 0; i < 3; i++) { + Document doc = storedFields.document(i); + assertEquals(storedValues[i], doc.getField("stored_field").numericValue().longValue()); + assertNull(doc.getField("dv_only")); // non-stored column should not appear + } + + // Verify both doc values columns + NumericDocValues storedDv = 
leaf.getNumericDocValues("stored_field"); + NumericDocValues dvOnly = leaf.getNumericDocValues("dv_only"); + for (int i = 0; i < 3; i++) { + assertEquals(i, storedDv.nextDoc()); + assertEquals(storedValues[i], storedDv.longValue()); + assertEquals(i, dvOnly.nextDoc()); + assertEquals(dvOnlyValues[i], dvOnly.longValue()); + } + + r.close(); + w.close(); + dir.close(); + } + + public void testStoredPointsColumn() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + // stored + points + FieldType storedPointType = new FieldType(); + storedPointType.setStored(true); + storedPointType.setDimensions(1, Integer.BYTES); + storedPointType.freeze(); + + int[] raw = {10, 20, 30}; + int[] docIds = {0, 1, 2}; + long[] values = new long[raw.length]; + for (int i = 0; i < raw.length; i++) { + values[i] = raw[i]; + } + w.addBatch( + simpleBatch( + 3, + new ArrayLongColumn( + "pt", storedPointType, LongColumn.NumericKind.INT, docIds, values))); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + + // Verify stored fields — decoded as ints. 
+ StoredFields storedFields = leaf.storedFields(); + for (int i = 0; i < raw.length; i++) { + Document doc = storedFields.document(i); + assertEquals(raw[i], doc.getField("pt").numericValue().intValue()); + } + + // Verify points + IndexSearcher searcher = new IndexSearcher(r); + assertEquals(1, searcher.count(IntPoint.newExactQuery("pt", 10))); + assertEquals(3, searcher.count(IntPoint.newRangeQuery("pt", 10, 30))); + + r.close(); + w.close(); + dir.close(); + } + + public void testInvertedColumn() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + // StringField-like: DOCS, omitNorms, non-tokenized + FieldType stringType = new FieldType(); + stringType.setIndexOptions(IndexOptions.DOCS); + stringType.setOmitNorms(true); + stringType.setTokenized(false); + stringType.freeze(); + + int[] docIds = {0, 1, 2}; + BytesRef[] values = {newBytesRef("alpha"), newBytesRef("beta"), newBytesRef("alpha")}; + w.addBatch(simpleBatch(3, new ArrayBinaryColumn("tag", stringType, docIds, values))); + + DirectoryReader r = DirectoryReader.open(w); + IndexSearcher searcher = new IndexSearcher(r); + assertEquals(2, searcher.count(new TermQuery(new Term("tag", "alpha")))); + assertEquals(1, searcher.count(new TermQuery(new Term("tag", "beta")))); + assertEquals(0, searcher.count(new TermQuery(new Term("tag", "gamma")))); + + r.close(); + w.close(); + dir.close(); + } + + public void testInvertedWithDocValues() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + // Inverted + SORTED doc values (like a StringField with doc values) + FieldType invertedDvType = new FieldType(); + invertedDvType.setIndexOptions(IndexOptions.DOCS); + invertedDvType.setOmitNorms(true); + invertedDvType.setTokenized(false); + invertedDvType.setDocValuesType(DocValuesType.SORTED); + invertedDvType.freeze(); + + int[] docIds = {0, 1, 2}; + BytesRef[] values = 
{newBytesRef("x"), newBytesRef("y"), newBytesRef("x")}; + w.addBatch(simpleBatch(3, new ArrayBinaryColumn("field", invertedDvType, docIds, values))); + + DirectoryReader r = DirectoryReader.open(w); + IndexSearcher searcher = new IndexSearcher(r); + + // Verify inverted index + assertEquals(2, searcher.count(new TermQuery(new Term("field", "x")))); + assertEquals(1, searcher.count(new TermQuery(new Term("field", "y")))); + + // Verify doc values + LeafReader leaf = getOnlyLeafReader(r); + SortedDocValues dv = leaf.getSortedDocValues("field"); + for (int i = 0; i < 3; i++) { + assertEquals(i, dv.nextDoc()); + assertEquals(values[i], dv.lookupOrd(dv.ordValue())); + } + + r.close(); + w.close(); + dir.close(); + } + + public void testInvertedWithStored() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + // Inverted + stored (like StringField with Store.YES) + FieldType invertedStoredType = new FieldType(StringField.TYPE_STORED); + invertedStoredType.freeze(); + + int[] docIds = {0, 1, 2}; + BytesRef[] values = {newBytesRef("aaa"), newBytesRef("bbb"), newBytesRef("ccc")}; + w.addBatch(simpleBatch(3, new ArrayBinaryColumn("field", invertedStoredType, docIds, values))); + + DirectoryReader r = DirectoryReader.open(w); + IndexSearcher searcher = new IndexSearcher(r); + + // Verify inverted index + assertEquals(1, searcher.count(new TermQuery(new Term("field", "aaa")))); + assertEquals(1, searcher.count(new TermQuery(new Term("field", "bbb")))); + + // Verify stored fields + LeafReader leaf = getOnlyLeafReader(r); + StoredFields storedFields = leaf.storedFields(); + for (int i = 0; i < 3; i++) { + Document doc = storedFields.document(i); + assertEquals(values[i], doc.getField("field").binaryValue()); + } + + r.close(); + w.close(); + dir.close(); + } + + public void testInvertedWithStoredAndDocValues() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, 
newIndexWriterConfig()); + + // Inverted + stored + SORTED doc values + FieldType allType = new FieldType(); + allType.setIndexOptions(IndexOptions.DOCS); + allType.setOmitNorms(true); + allType.setTokenized(false); + allType.setStored(true); + allType.setDocValuesType(DocValuesType.SORTED); + allType.freeze(); + + int[] docIds = {0, 1, 2}; + BytesRef[] values = {newBytesRef("x"), newBytesRef("y"), newBytesRef("z")}; + w.addBatch(simpleBatch(3, new ArrayBinaryColumn("field", allType, docIds, values))); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + IndexSearcher searcher = new IndexSearcher(r); + + // Verify inverted index + assertEquals(1, searcher.count(new TermQuery(new Term("field", "x")))); + + // Verify stored fields + StoredFields storedFields = leaf.storedFields(); + for (int i = 0; i < 3; i++) { + assertEquals(values[i], storedFields.document(i).getField("field").binaryValue()); + } + + // Verify doc values + SortedDocValues dv = leaf.getSortedDocValues("field"); + for (int i = 0; i < 3; i++) { + assertEquals(i, dv.nextDoc()); + assertEquals(values[i], dv.lookupOrd(dv.ordValue())); + } + + r.close(); + w.close(); + dir.close(); + } + + public void testInvertedSparse() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + FieldType stringType = new FieldType(); + stringType.setIndexOptions(IndexOptions.DOCS); + stringType.setOmitNorms(true); + stringType.setTokenized(false); + stringType.freeze(); + + // Only doc 1 out of 3 has a term + int[] docIds = {1}; + BytesRef[] values = {newBytesRef("found")}; + w.addBatch(simpleBatch(3, new ArrayBinaryColumn("tag", stringType, docIds, values))); + + DirectoryReader r = DirectoryReader.open(w); + IndexSearcher searcher = new IndexSearcher(r); + assertEquals(1, searcher.count(new TermQuery(new Term("tag", "found")))); + + r.close(); + w.close(); + dir.close(); + } + + public void testTokenizedColumn() 
throws IOException { + Directory dir = newDirectory(); + IndexWriterConfig config = newIndexWriterConfig(new MockAnalyzer(random())); + IndexWriter w = new IndexWriter(dir, config); + + // TextField-like: tokenized, DOCS_AND_FREQS_AND_POSITIONS + int[] docIds = {0, 1, 2}; + BytesRef[] values = { + newBytesRef("quick brown fox"), newBytesRef("lazy brown dog"), newBytesRef("quick fox jumps") + }; + w.addBatch( + simpleBatch(3, new ArrayBinaryColumn("text", TextField.TYPE_NOT_STORED, docIds, values))); + + DirectoryReader r = DirectoryReader.open(w); + IndexSearcher searcher = new IndexSearcher(r); + + // Each word was tokenized — verify individual terms + assertEquals(2, searcher.count(new TermQuery(new Term("text", "quick")))); + assertEquals(2, searcher.count(new TermQuery(new Term("text", "brown")))); + assertEquals(2, searcher.count(new TermQuery(new Term("text", "fox")))); + assertEquals(1, searcher.count(new TermQuery(new Term("text", "lazy")))); + assertEquals(1, searcher.count(new TermQuery(new Term("text", "dog")))); + assertEquals(1, searcher.count(new TermQuery(new Term("text", "jumps")))); + assertEquals(0, searcher.count(new TermQuery(new Term("text", "missing")))); + + r.close(); + w.close(); + dir.close(); + } + + public void testTokenizedWithStored() throws IOException { + Directory dir = newDirectory(); + IndexWriterConfig config = newIndexWriterConfig(new MockAnalyzer(random())); + IndexWriter w = new IndexWriter(dir, config); + + int[] docIds = {0, 1}; + BytesRef[] values = {newBytesRef("hello world"), newBytesRef("goodbye world")}; + w.addBatch( + simpleBatch(2, new ArrayBinaryColumn("text", TextField.TYPE_STORED, docIds, values))); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + IndexSearcher searcher = new IndexSearcher(r); + + // Verify tokenized search + assertEquals(2, searcher.count(new TermQuery(new Term("text", "world")))); + assertEquals(1, searcher.count(new TermQuery(new Term("text", 
"hello")))); + + // Verify stored fields + StoredFields storedFields = leaf.storedFields(); + assertEquals(values[0], storedFields.document(0).getField("text").binaryValue()); + assertEquals(values[1], storedFields.document(1).getField("text").binaryValue()); + + r.close(); + w.close(); + dir.close(); + } + + public void testColumnWithNoneDocValuesTypeAndNoPointsThrows() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + // FieldType with NONE doc values type and no points + FieldType badType = new FieldType(); + badType.freeze(); + + int[] docIds = {0}; + long[] values = {1}; + expectThrows( + IllegalArgumentException.class, + () -> w.addBatch(simpleBatch(1, new ArrayLongColumn("bad", badType, docIds, values)))); + + // Writer should still be usable after the failure + w.addBatch( + simpleBatch( + 1, + new ArrayLongColumn( + "numeric", NumericDocValuesField.TYPE, new int[] {0}, new long[] {42}))); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + NumericDocValues dv = leaf.getNumericDocValues("numeric"); + assertNotNull(dv); + // The failed batch's doc was marked deleted; the successful batch's doc is still live + int doc = dv.nextDoc(); + assertTrue(doc != DocIdSetIterator.NO_MORE_DOCS); + assertEquals(42, dv.longValue()); + + r.close(); + w.close(); + dir.close(); + } + + public void testStoredWithDocValuesAndPoints() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + // stored + SORTED_NUMERIC DV + 4-byte points + FieldType allType = new FieldType(); + allType.setStored(true); + allType.setDocValuesType(DocValuesType.SORTED_NUMERIC); + allType.setDimensions(1, Integer.BYTES); + allType.freeze(); + + int[] raw = {10, 20, 30}; + int[] docIds = {0, 1, 2}; + long[] values = new long[raw.length]; + for (int i = 0; i < raw.length; i++) { + values[i] = raw[i]; + } + w.addBatch( + 
simpleBatch( + 3, new ArrayLongColumn("field", allType, LongColumn.NumericKind.INT, docIds, values))); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + + // Verify stored fields — decoded as ints. + StoredFields storedFields = leaf.storedFields(); + for (int i = 0; i < raw.length; i++) { + assertEquals(raw[i], storedFields.document(i).getField("field").numericValue().intValue()); + } + + // Verify doc values (raw int widened to long). + SortedNumericDocValues dv = leaf.getSortedNumericDocValues("field"); + for (int i = 0; i < raw.length; i++) { + assertEquals(i, dv.nextDoc()); + assertEquals(raw[i], dv.nextValue()); + } + + // Verify points + assertEquals(3, leaf.getPointValues("field").size()); + + r.close(); + w.close(); + dir.close(); + } + + public void testMultiValuedStoredWithDocValues() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + // stored + SORTED_NUMERIC doc values (multi-valued) + FieldType storedSortedNumericType = new FieldType(); + storedSortedNumericType.setStored(true); + storedSortedNumericType.setDocValuesType(DocValuesType.SORTED_NUMERIC); + storedSortedNumericType.freeze(); + + // Doc 0 has two values (10, 20), doc 1 has one value (30) + int[] docIds = {0, 0, 1}; + long[] values = {10, 20, 30}; + w.addBatch(simpleBatch(2, new ArrayLongColumn("val", storedSortedNumericType, docIds, values))); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + + // Verify stored fields — each value occurrence is stored separately + StoredFields storedFields = leaf.storedFields(); + Document doc0 = storedFields.document(0); + assertEquals(2, doc0.getFields("val").length); + assertEquals(10L, doc0.getFields("val")[0].numericValue().longValue()); + assertEquals(20L, doc0.getFields("val")[1].numericValue().longValue()); + Document doc1 = storedFields.document(1); + assertEquals(1, 
doc1.getFields("val").length); + assertEquals(30L, doc1.getFields("val")[0].numericValue().longValue()); + + // Verify doc values + SortedNumericDocValues dv = leaf.getSortedNumericDocValues("val"); + assertEquals(0, dv.nextDoc()); + assertEquals(2, dv.docValueCount()); + assertEquals(10, dv.nextValue()); + assertEquals(20, dv.nextValue()); + assertEquals(1, dv.nextDoc()); + assertEquals(1, dv.docValueCount()); + assertEquals(30, dv.nextValue()); + + r.close(); + w.close(); + dir.close(); + } + + public void testDenseNumericDocValues() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + long[] values = {100, 200, 300}; + w.addBatch(simpleBatch(3, new ArrayDenseLongColumn("val", NumericDocValuesField.TYPE, values))); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + + NumericDocValues dv = leaf.getNumericDocValues("val"); + for (int i = 0; i < 3; i++) { + assertEquals(i, dv.nextDoc()); + assertEquals(values[i], dv.longValue()); + } + + r.close(); + w.close(); + dir.close(); + } + + public void testDenseSortedNumericDocValues() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + long[] values = {10, 20, 30, 40, 50}; + w.addBatch( + simpleBatch(5, new ArrayDenseLongColumn("val", SortedNumericDocValuesField.TYPE, values))); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + + SortedNumericDocValues dv = leaf.getSortedNumericDocValues("val"); + for (int i = 0; i < 5; i++) { + assertEquals(i, dv.nextDoc()); + assertEquals(1, dv.docValueCount()); + assertEquals(values[i], dv.nextValue()); + } + + r.close(); + w.close(); + dir.close(); + } + + public void testDenseColumnCountMismatchThrows() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + // 2 values but batch expects 3 
documents + long[] values = {10, 20}; + expectThrows( + IllegalArgumentException.class, + () -> + w.addBatch( + simpleBatch( + 3, new ArrayDenseLongColumn("val", NumericDocValuesField.TYPE, values)))); + + // Writer should still be usable after the failure — use a different field to avoid + // the partially-written DV entries from the failed batch + w.addBatch( + simpleBatch( + 1, new ArrayDenseLongColumn("val2", NumericDocValuesField.TYPE, new long[] {42}))); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + NumericDocValues dv = leaf.getNumericDocValues("val2"); + assertNotNull(dv); + int doc = dv.nextDoc(); + assertTrue(doc != DocIdSetIterator.NO_MORE_DOCS); + assertEquals(42, dv.longValue()); + + r.close(); + w.close(); + dir.close(); + } + + public void testDenseColumnTooManyValuesThrows() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + // 5 values but batch only has 3 documents + long[] values = {10, 20, 30, 40, 50}; + expectThrows( + IllegalArgumentException.class, + () -> + w.addBatch( + simpleBatch( + 3, new ArrayDenseLongColumn("val", NumericDocValuesField.TYPE, values)))); + + // Writer should still be usable — no values were written past numDocs + w.addBatch( + simpleBatch( + 1, new ArrayDenseLongColumn("val2", NumericDocValuesField.TYPE, new long[] {42}))); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + NumericDocValues dv = leaf.getNumericDocValues("val2"); + assertNotNull(dv); + int doc = dv.nextDoc(); + assertTrue(doc != DocIdSetIterator.NO_MORE_DOCS); + assertEquals(42, dv.longValue()); + + r.close(); + w.close(); + dir.close(); + } + + public void testIntSparseNumericDocValues() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + // Sparse: only docs 0 and 2 have values. 
+ int[] docIds = {0, 2}; + int[] raw = {-7, 9}; + long[] values = new long[raw.length]; + for (int i = 0; i < raw.length; i++) { + values[i] = raw[i]; + } + w.addBatch( + simpleBatch( + 3, + new ArrayLongColumn( + "val", NumericDocValuesField.TYPE, LongColumn.NumericKind.INT, docIds, values))); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + NumericDocValues dv = leaf.getNumericDocValues("val"); + assertEquals(0, dv.nextDoc()); + assertEquals(-7, dv.longValue()); + assertEquals(2, dv.nextDoc()); + assertEquals(9, dv.longValue()); + assertEquals(DocIdSetIterator.NO_MORE_DOCS, dv.nextDoc()); + + r.close(); + w.close(); + dir.close(); + } + + public void testLongColumnPointWidthMismatchThrows() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + FieldType pointType = new FieldType(); + pointType.setDimensions(1, Integer.BYTES); // expects 4 bytes + pointType.freeze(); + + // LONG kind implies 8-byte point bytes; should fail validation against a 4-byte point type. + expectThrows( + IllegalArgumentException.class, + () -> + w.addBatch( + simpleBatch( + 1, + new ArrayLongColumn( + "pt", + pointType, + LongColumn.NumericKind.LONG, + new int[] {0}, + new long[] {1})))); + + w.close(); + dir.close(); + } + + public void testBinaryColumnNumericDVBadFixedSizeThrows() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + // Variable-size binary into NUMERIC DV should fail validation (fixedSize=-1). 
+ expectThrows( + IllegalArgumentException.class, + () -> + w.addBatch( + simpleBatch( + 1, + new ArrayBinaryColumn( + "val", + NumericDocValuesField.TYPE, + new int[] {0}, + new BytesRef[] {newBytesRef("x")})))); + + w.close(); + dir.close(); + } + + public void testLongColumnMultiDimPointsThrows() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + FieldType pointType = new FieldType(); + pointType.setDimensions(2, Long.BYTES); + pointType.freeze(); + + expectThrows( + IllegalArgumentException.class, + () -> + w.addBatch( + simpleBatch( + 1, new ArrayLongColumn("pt", pointType, new int[] {0}, new long[] {1})))); + + w.close(); + dir.close(); + } + + public void testDenseLongColumnWithStoredFields() throws IOException { + // Covers the "single column consumed by both passes via fresh cursors" case: a dense + // LongColumn with stored+numeric DV. Row pass uses tuples(), column pass uses values(). + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + FieldType storedNumericType = new FieldType(); + storedNumericType.setStored(true); + storedNumericType.setDocValuesType(DocValuesType.NUMERIC); + storedNumericType.freeze(); + + long[] values = {100, 200, 300, 400}; + w.addBatch(simpleBatch(4, new ArrayDenseLongColumn("val", storedNumericType, values))); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + + StoredFields storedFields = leaf.storedFields(); + for (int i = 0; i < values.length; i++) { + assertEquals(values[i], storedFields.document(i).getField("val").numericValue().longValue()); + } + + NumericDocValues dv = leaf.getNumericDocValues("val"); + for (int i = 0; i < values.length; i++) { + assertEquals(i, dv.nextDoc()); + assertEquals(values[i], dv.longValue()); + } + + r.close(); + w.close(); + dir.close(); + } + + public void testStoredTypeIntegerFromLongColumn() throws IOException { + Directory dir 
= newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + FieldType type = new FieldType(); + type.setStored(true); + type.freeze(); + + int[] raw = {1, -2, 3}; + long[] values = new long[raw.length]; + for (int i = 0; i < raw.length; i++) { + values[i] = raw[i]; + } + w.addBatch( + simpleBatch( + 3, + new ArrayLongColumn( + "val", type, LongColumn.NumericKind.INT, new int[] {0, 1, 2}, values))); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + StoredFields storedFields = leaf.storedFields(); + for (int i = 0; i < raw.length; i++) { + assertEquals(raw[i], storedFields.document(i).getField("val").numericValue().intValue()); + } + + r.close(); + w.close(); + dir.close(); + } + + public void testStoredTypeLongFromLongColumn() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + FieldType type = new FieldType(); + type.setStored(true); + type.freeze(); + + long[] raw = {Long.MIN_VALUE, 0L, Long.MAX_VALUE}; + w.addBatch(simpleBatch(3, new ArrayLongColumn("val", type, new int[] {0, 1, 2}, raw.clone()))); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + StoredFields storedFields = leaf.storedFields(); + for (int i = 0; i < raw.length; i++) { + assertEquals(raw[i], storedFields.document(i).getField("val").numericValue().longValue()); + } + + r.close(); + w.close(); + dir.close(); + } + + public void testStoredTypeFloatFromLongColumn() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + FieldType type = new FieldType(); + type.setStored(true); + type.freeze(); + + float[] raw = {1.5f, -2.25f, Float.MAX_VALUE}; + long[] values = new long[raw.length]; + for (int i = 0; i < raw.length; i++) { + values[i] = NumericUtils.floatToSortableInt(raw[i]); + } + w.addBatch( + simpleBatch( + 3, + new ArrayLongColumn( + "val", type, 
LongColumn.NumericKind.FLOAT, new int[] {0, 1, 2}, values))); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + StoredFields storedFields = leaf.storedFields(); + for (int i = 0; i < raw.length; i++) { + assertEquals( + raw[i], storedFields.document(i).getField("val").numericValue().floatValue(), 0f); + } + + r.close(); + w.close(); + dir.close(); + } + + public void testStoredTypeDoubleFromLongColumn() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + FieldType type = new FieldType(); + type.setStored(true); + type.freeze(); + + double[] raw = {1.5d, -2.25d, Double.MAX_VALUE}; + long[] values = new long[raw.length]; + for (int i = 0; i < raw.length; i++) { + values[i] = NumericUtils.doubleToSortableLong(raw[i]); + } + w.addBatch( + simpleBatch( + 3, + new ArrayLongColumn( + "val", type, LongColumn.NumericKind.DOUBLE, new int[] {0, 1, 2}, values))); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + StoredFields storedFields = leaf.storedFields(); + for (int i = 0; i < raw.length; i++) { + assertEquals( + raw[i], storedFields.document(i).getField("val").numericValue().doubleValue(), 0d); + } + + r.close(); + w.close(); + dir.close(); + } + + public void testStoredTypeStringFromBinaryColumn() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + FieldType type = new FieldType(); + type.setStored(true); + type.freeze(); + + String[] raw = {"hello", "wörld", "🦜"}; + BytesRef[] values = new BytesRef[raw.length]; + for (int i = 0; i < raw.length; i++) { + values[i] = newBytesRef(raw[i]); + } + w.addBatch( + simpleBatch( + 3, + new ArrayBinaryColumn( + "val", type, new int[] {0, 1, 2}, values, StoredValue.Type.STRING))); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + StoredFields storedFields = 
leaf.storedFields(); + for (int i = 0; i < raw.length; i++) { + assertEquals(raw[i], storedFields.document(i).getField("val").stringValue()); + } + + r.close(); + w.close(); + dir.close(); + } + + public void testStoredTypeFloatWithNumericDV() throws IOException { + // FLOAT kind on a LongColumn that also feeds NumericDV. + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + FieldType type = new FieldType(); + type.setStored(true); + type.setDocValuesType(DocValuesType.NUMERIC); + type.freeze(); + + float[] raw = {1.5f, -2.25f, 42.0f}; + long[] values = new long[raw.length]; + for (int i = 0; i < raw.length; i++) { + values[i] = NumericUtils.floatToSortableInt(raw[i]); + } + w.addBatch( + simpleBatch( + 3, + new ArrayLongColumn( + "val", type, LongColumn.NumericKind.FLOAT, new int[] {0, 1, 2}, values))); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + + // Stored values decoded as floats. + StoredFields storedFields = leaf.storedFields(); + for (int i = 0; i < raw.length; i++) { + assertEquals( + raw[i], storedFields.document(i).getField("val").numericValue().floatValue(), 0f); + } + + // NumericDV stores the sortable-int encoding sign-extended to long. + NumericDocValues dv = leaf.getNumericDocValues("val"); + for (int i = 0; i < raw.length; i++) { + assertEquals(i, dv.nextDoc()); + assertEquals(values[i], dv.longValue()); + } + + r.close(); + w.close(); + dir.close(); + } + + public void testStoredTypeIntegerWithNumericDV() throws IOException { + // INT kind on a LongColumn that also feeds NumericDV. 
+ Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + FieldType type = new FieldType(); + type.setStored(true); + type.setDocValuesType(DocValuesType.NUMERIC); + type.freeze(); + + int[] raw = {Integer.MIN_VALUE, -1, 0, 42, Integer.MAX_VALUE}; + long[] values = new long[raw.length]; + int[] docIds = new int[raw.length]; + for (int i = 0; i < raw.length; i++) { + docIds[i] = i; + values[i] = raw[i]; + } + w.addBatch( + simpleBatch( + raw.length, + new ArrayLongColumn("val", type, LongColumn.NumericKind.INT, docIds, values))); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + + StoredFields storedFields = leaf.storedFields(); + for (int i = 0; i < raw.length; i++) { + assertEquals(raw[i], storedFields.document(i).getField("val").numericValue().intValue()); + } + + NumericDocValues dv = leaf.getNumericDocValues("val"); + for (int i = 0; i < raw.length; i++) { + assertEquals(i, dv.nextDoc()); + assertEquals(raw[i], dv.longValue()); + } + + r.close(); + w.close(); + dir.close(); + } + + public void testStoredTypeDoubleWithNumericDV() throws IOException { + // DOUBLE kind on a LongColumn that also feeds NumericDV. 
+ Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + FieldType type = new FieldType(); + type.setStored(true); + type.setDocValuesType(DocValuesType.NUMERIC); + type.freeze(); + + double[] raw = {Double.NEGATIVE_INFINITY, -1.5d, 0.0d, 2.25d, Double.POSITIVE_INFINITY}; + long[] values = new long[raw.length]; + int[] docIds = new int[raw.length]; + for (int i = 0; i < raw.length; i++) { + docIds[i] = i; + values[i] = NumericUtils.doubleToSortableLong(raw[i]); + } + w.addBatch( + simpleBatch( + raw.length, + new ArrayLongColumn("val", type, LongColumn.NumericKind.DOUBLE, docIds, values))); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + + StoredFields storedFields = leaf.storedFields(); + for (int i = 0; i < raw.length; i++) { + assertEquals( + raw[i], storedFields.document(i).getField("val").numericValue().doubleValue(), 0d); + } + + NumericDocValues dv = leaf.getNumericDocValues("val"); + for (int i = 0; i < raw.length; i++) { + assertEquals(i, dv.nextDoc()); + assertEquals(values[i], dv.longValue()); + } + + r.close(); + w.close(); + dir.close(); + } + + public void testStoredTypeDataInputRejected() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + FieldType type = new FieldType(); + type.setStored(true); + type.freeze(); + + expectThrows( + IllegalArgumentException.class, + () -> + w.addBatch( + simpleBatch( + 1, + new ArrayBinaryColumn( + "val", + type, + new int[] {0}, + new BytesRef[] {newBytesRef("x")}, + StoredValue.Type.DATA_INPUT)))); + + w.close(); + dir.close(); + } + + public void testBinaryColumnMultiDimPointsOnly() throws IOException { + // Plain BinaryColumn with 2-D int points (fixedSize = 2 * 4 = 8). Caller pre-packs bytes via + // IntPoint.pack; the chain writes them to points unchanged. 
+ Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + FieldType pointType = new FieldType(); + pointType.setDimensions(2, Integer.BYTES); + pointType.freeze(); + + int[] docIds = {0, 1, 2}; + BytesRef[] values = {IntPoint.pack(1, 10), IntPoint.pack(2, 20), IntPoint.pack(3, 30)}; + w.addBatch(simpleBatch(3, new ArrayBinaryColumn("pt", pointType, docIds, values))); + + DirectoryReader r = DirectoryReader.open(w); + IndexSearcher searcher = new IndexSearcher(r); + assertEquals( + 1, searcher.count(IntPoint.newRangeQuery("pt", new int[] {1, 10}, new int[] {1, 10}))); + assertEquals( + 3, searcher.count(IntPoint.newRangeQuery("pt", new int[] {0, 0}, new int[] {10, 100}))); + + r.close(); + w.close(); + dir.close(); + } + + public void testBinaryColumnPointsOnlyArbitraryWidth() throws IOException { + // 3-D int points (12 bytes) via plain BinaryColumn — arbitrary widths are fine for the + // opaque-bytes path since no numeric transform is applied. 
+ Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + FieldType type = new FieldType(); + type.setDimensions(3, Integer.BYTES); + type.freeze(); + + int[][] raw = {{1, 2, 3}, {4, 5, 6}, {10, 20, 30}}; + BytesRef[] values = new BytesRef[raw.length]; + int[] docIds = new int[raw.length]; + for (int i = 0; i < raw.length; i++) { + docIds[i] = i; + values[i] = IntPoint.pack(raw[i]); + } + w.addBatch(simpleBatch(raw.length, new ArrayBinaryColumn("pt", type, docIds, values))); + + DirectoryReader r = DirectoryReader.open(w); + IndexSearcher searcher = new IndexSearcher(r); + assertEquals( + 1, searcher.count(IntPoint.newRangeQuery("pt", new int[] {1, 2, 3}, new int[] {1, 2, 3}))); + assertEquals( + 3, + searcher.count( + IntPoint.newRangeQuery("pt", new int[] {0, 0, 0}, new int[] {100, 100, 100}))); + + r.close(); + w.close(); + dir.close(); + } + + public void testBinaryColumnSortedDVAndPoints() throws IOException { + // Plain BinaryColumn with SORTED DV + 1-D int point. Same BytesRef goes to both writers. 
+ Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + FieldType type = new FieldType(); + type.setDimensions(1, Integer.BYTES); + type.setDocValuesType(DocValuesType.SORTED); + type.freeze(); + + int[] docIds = {0, 1, 2}; + BytesRef[] values = {IntPoint.pack(10), IntPoint.pack(20), IntPoint.pack(30)}; + w.addBatch(simpleBatch(3, new ArrayBinaryColumn("field", type, docIds, values))); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + + SortedDocValues dv = leaf.getSortedDocValues("field"); + for (int i = 0; i < 3; i++) { + assertEquals(i, dv.nextDoc()); + assertEquals(values[i], dv.lookupOrd(dv.ordValue())); + } + + IndexSearcher searcher = new IndexSearcher(r); + assertEquals(1, searcher.count(IntPoint.newExactQuery("field", 10))); + assertEquals(3, searcher.count(IntPoint.newRangeQuery("field", 10, 30))); + + r.close(); + w.close(); + dir.close(); + } + + public void testNumericKindIntPointsAndDV() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + FieldType type = new FieldType(); + type.setDimensions(1, Integer.BYTES); + type.setDocValuesType(DocValuesType.SORTED_NUMERIC); + type.freeze(); + + int[] raw = {-5, -1, 0, 7, Integer.MAX_VALUE}; + long[] values = new long[raw.length]; + int[] docIds = new int[raw.length]; + for (int i = 0; i < raw.length; i++) { + docIds[i] = i; + values[i] = raw[i]; + } + w.addBatch( + simpleBatch( + raw.length, + new ArrayLongColumn("val", type, LongColumn.NumericKind.INT, docIds, values))); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + + SortedNumericDocValues dv = leaf.getSortedNumericDocValues("val"); + for (int i = 0; i < raw.length; i++) { + assertEquals(i, dv.nextDoc()); + assertEquals(1, dv.docValueCount()); + assertEquals(raw[i], dv.nextValue()); + } + + IndexSearcher searcher = new IndexSearcher(r); + assertEquals( + 
raw.length, + searcher.count(IntPoint.newRangeQuery("val", Integer.MIN_VALUE, Integer.MAX_VALUE))); + assertEquals(1, searcher.count(IntPoint.newExactQuery("val", -5))); + assertEquals(3, searcher.count(IntPoint.newRangeQuery("val", -1, 7))); + + r.close(); + w.close(); + dir.close(); + } + + public void testNumericKindLongPointsAndDV() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + FieldType type = new FieldType(); + type.setDimensions(1, Long.BYTES); + type.setDocValuesType(DocValuesType.SORTED_NUMERIC); + type.freeze(); + + long[] raw = {Long.MIN_VALUE, -100L, 0L, 42L, Long.MAX_VALUE}; + int[] docIds = new int[raw.length]; + for (int i = 0; i < raw.length; i++) { + docIds[i] = i; + } + w.addBatch(simpleBatch(raw.length, new ArrayLongColumn("val", type, docIds, raw.clone()))); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + + SortedNumericDocValues dv = leaf.getSortedNumericDocValues("val"); + for (int i = 0; i < raw.length; i++) { + assertEquals(i, dv.nextDoc()); + assertEquals(raw[i], dv.nextValue()); + } + + IndexSearcher searcher = new IndexSearcher(r); + assertEquals( + raw.length, searcher.count(LongPoint.newRangeQuery("val", Long.MIN_VALUE, Long.MAX_VALUE))); + assertEquals(1, searcher.count(LongPoint.newExactQuery("val", Long.MIN_VALUE))); + assertEquals(3, searcher.count(LongPoint.newRangeQuery("val", -100L, 42L))); + + r.close(); + w.close(); + dir.close(); + } + + public void testNumericKindFloatPointsAndDV() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + FieldType type = new FieldType(); + type.setDimensions(1, Float.BYTES); + type.setDocValuesType(DocValuesType.SORTED_NUMERIC); + type.freeze(); + + float[] raw = {Float.NEGATIVE_INFINITY, -1.5f, 0.0f, 2.25f, Float.POSITIVE_INFINITY}; + long[] values = new long[raw.length]; + int[] docIds = new 
int[raw.length]; + for (int i = 0; i < raw.length; i++) { + docIds[i] = i; + values[i] = NumericUtils.floatToSortableInt(raw[i]); + } + w.addBatch( + simpleBatch( + raw.length, + new ArrayLongColumn("val", type, LongColumn.NumericKind.FLOAT, docIds, values))); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + + // DV stores the sortable-int encoding; decode via sortableIntToFloat. + SortedNumericDocValues dv = leaf.getSortedNumericDocValues("val"); + for (int i = 0; i < raw.length; i++) { + assertEquals(i, dv.nextDoc()); + assertEquals(raw[i], NumericUtils.sortableIntToFloat((int) dv.nextValue()), 0f); + } + + // Points sort numerically. + IndexSearcher searcher = new IndexSearcher(r); + assertEquals( + raw.length, + searcher.count( + FloatPoint.newRangeQuery("val", Float.NEGATIVE_INFINITY, Float.POSITIVE_INFINITY))); + assertEquals(1, searcher.count(FloatPoint.newExactQuery("val", -1.5f))); + assertEquals(3, searcher.count(FloatPoint.newRangeQuery("val", -1.5f, 2.25f))); + + r.close(); + w.close(); + dir.close(); + } + + public void testNumericKindDoublePointsAndDV() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + FieldType type = new FieldType(); + type.setDimensions(1, Double.BYTES); + type.setDocValuesType(DocValuesType.SORTED_NUMERIC); + type.freeze(); + + double[] raw = {Double.NEGATIVE_INFINITY, -1.5d, 0.0d, 2.25d, Double.POSITIVE_INFINITY}; + long[] values = new long[raw.length]; + int[] docIds = new int[raw.length]; + for (int i = 0; i < raw.length; i++) { + docIds[i] = i; + values[i] = NumericUtils.doubleToSortableLong(raw[i]); + } + w.addBatch( + simpleBatch( + raw.length, + new ArrayLongColumn("val", type, LongColumn.NumericKind.DOUBLE, docIds, values))); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + + SortedNumericDocValues dv = leaf.getSortedNumericDocValues("val"); + for (int i = 0; i < 
raw.length; i++) { + assertEquals(i, dv.nextDoc()); + assertEquals(raw[i], NumericUtils.sortableLongToDouble(dv.nextValue()), 0d); + } + + IndexSearcher searcher = new IndexSearcher(r); + assertEquals( + raw.length, + searcher.count( + DoublePoint.newRangeQuery("val", Double.NEGATIVE_INFINITY, Double.POSITIVE_INFINITY))); + assertEquals(1, searcher.count(DoublePoint.newExactQuery("val", -1.5d))); + assertEquals(3, searcher.count(DoublePoint.newRangeQuery("val", -1.5d, 2.25d))); + + r.close(); + w.close(); + dir.close(); + } + + public void testNumericKindPointsAndDVMultiDimRejected() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + // 2D int: scenario 3 requires 1D. + FieldType type = new FieldType(); + type.setDimensions(2, Integer.BYTES); + type.setDocValuesType(DocValuesType.SORTED_NUMERIC); + type.freeze(); + + expectThrows( + IllegalArgumentException.class, + () -> + w.addBatch( + simpleBatch( + 1, + new ArrayLongColumn( + "val", + type, + LongColumn.NumericKind.LONG, + new int[] {0}, + new long[] {1L})))); + + w.close(); + dir.close(); + } + + public void testNumericKindPointsAndDVWidthMismatch() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + // FLOAT kind requires a 4-byte point field; declaring Long.BYTES should throw. + FieldType type = new FieldType(); + type.setDimensions(1, Long.BYTES); + type.setDocValuesType(DocValuesType.SORTED_NUMERIC); + type.freeze(); + + expectThrows( + IllegalArgumentException.class, + () -> + w.addBatch( + simpleBatch( + 1, + new ArrayLongColumn( + "val", + type, + LongColumn.NumericKind.FLOAT, + new int[] {0}, + new long[] {1L})))); + + w.close(); + dir.close(); + } + + public void testNumericKindFloatDVOnly() throws IOException { + // DV only (no points): LongColumn stores the long value unchanged. 
For FLOAT, callers feed + // sortable-int bits in the low 32 bits, and DV reads them back sign-extended to long. + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + FieldType type = new FieldType(); + type.setDocValuesType(DocValuesType.NUMERIC); + type.freeze(); + + float[] raw = {1.5f, -2.25f, Float.MAX_VALUE}; + long[] values = new long[raw.length]; + int[] docIds = new int[raw.length]; + for (int i = 0; i < raw.length; i++) { + docIds[i] = i; + values[i] = NumericUtils.floatToSortableInt(raw[i]); + } + w.addBatch( + simpleBatch( + raw.length, + new ArrayLongColumn("val", type, LongColumn.NumericKind.FLOAT, docIds, values))); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + NumericDocValues dv = leaf.getNumericDocValues("val"); + for (int i = 0; i < raw.length; i++) { + assertEquals(i, dv.nextDoc()); + assertEquals(values[i], dv.longValue()); + } + + r.close(); + w.close(); + dir.close(); + } + + /** + * With a sparse row column, the batch must still produce {@code numDocs} documents in the + * segment, and stored-fields for un-populated docs must be empty (not shifted, not missing). This + * guards the row-dense framing contract: every doc-id in {@code [0, numDocs)} is framed + * regardless of whether any row column has a value at that doc. + */ + public void testSparseStoredFramingPreservesNumDocs() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + FieldType storedOnly = new FieldType(); + storedOnly.setStored(true); + storedOnly.freeze(); + + // 5 batch docs, but only docs 1 and 3 have a stored value. 
+ int[] docIds = {1, 3}; + BytesRef[] values = {newBytesRef("one"), newBytesRef("three")}; + w.addBatch(simpleBatch(5, new ArrayBinaryColumn("field", storedOnly, docIds, values))); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + assertEquals(5, leaf.maxDoc()); + + StoredFields storedFields = leaf.storedFields(); + assertNull(storedFields.document(0).getField("field")); + assertEquals(newBytesRef("one"), storedFields.document(1).getField("field").binaryValue()); + assertNull(storedFields.document(2).getField("field")); + assertEquals(newBytesRef("three"), storedFields.document(3).getField("field").binaryValue()); + assertNull(storedFields.document(4).getField("field")); + + r.close(); + w.close(); + dir.close(); + } + + /** + * With a sparse indexed row column, the segment must still have {@code numDocs} documents, and + * the inverted index must reflect only the populated docs. Guards termsHash framing alignment. + */ + public void testSparseIndexedFramingPreservesNumDocs() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + FieldType indexedType = new FieldType(); + indexedType.setIndexOptions(IndexOptions.DOCS); + indexedType.setOmitNorms(true); + indexedType.setTokenized(false); + indexedType.freeze(); + + // 6 batch docs, only docs 2 and 5 have a term. 
+ int[] docIds = {2, 5}; + BytesRef[] values = {newBytesRef("a"), newBytesRef("b")}; + w.addBatch(simpleBatch(6, new ArrayBinaryColumn("tag", indexedType, docIds, values))); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + assertEquals(6, leaf.maxDoc()); + + IndexSearcher searcher = new IndexSearcher(r); + assertEquals(1, searcher.count(new TermQuery(new Term("tag", "a")))); + assertEquals(1, searcher.count(new TermQuery(new Term("tag", "b")))); + + r.close(); + w.close(); + dir.close(); + } + + /** + * When some docs in the batch have only a DV column (no row column value), framing still happens + * for every doc: stored fields must be empty for those docs, inverted index untouched, and DV + * values align with their batch doc-ids. + */ + public void testSparseRowMixedWithDenseDocValues() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + FieldType storedOnly = new FieldType(); + storedOnly.setStored(true); + storedOnly.freeze(); + + // Row-sparse stored column: only docs 0 and 3 have a stored value. + int[] storedDocIds = {0, 3}; + BytesRef[] storedValues = {newBytesRef("first"), newBytesRef("fourth")}; + // Dense DV column covering every doc. 
+ long[] dvValues = {100, 200, 300, 400}; + + w.addBatch( + simpleBatch( + 4, + new ArrayBinaryColumn("stored", storedOnly, storedDocIds, storedValues), + new ArrayDenseLongColumn("dv", NumericDocValuesField.TYPE, dvValues))); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + assertEquals(4, leaf.maxDoc()); + + StoredFields storedFields = leaf.storedFields(); + assertEquals(newBytesRef("first"), storedFields.document(0).getField("stored").binaryValue()); + assertNull(storedFields.document(1).getField("stored")); + assertNull(storedFields.document(2).getField("stored")); + assertEquals(newBytesRef("fourth"), storedFields.document(3).getField("stored").binaryValue()); + + NumericDocValues dv = leaf.getNumericDocValues("dv"); + for (int i = 0; i < dvValues.length; i++) { + assertEquals(i, dv.nextDoc()); + assertEquals(dvValues[i], dv.longValue()); + } + assertEquals(DocIdSetIterator.NO_MORE_DOCS, dv.nextDoc()); + + r.close(); + w.close(); + dir.close(); + } + + /** + * Indexing the same logical docs via {@code addBatch} with a sparse row column vs. via {@code + * addDocument} one doc at a time must produce segments with the same {@code maxDoc} and the same + * stored-field / inverted-index visibility. This is the golden equivalence check. + */ + public void testSparseBatchMatchesDocByDoc() throws IOException { + FieldType storedIndexed = new FieldType(StringField.TYPE_STORED); + storedIndexed.freeze(); + + // 7 docs; only docs 1, 2, and 5 have values for the row column. 
+ int[] docIds = {1, 2, 5}; + String[] values = {"alpha", "beta", "gamma"}; + int totalDocs = 7; + + // --- Batch path --- + Directory batchDir = newDirectory(); + try (IndexWriter batchW = new IndexWriter(batchDir, newIndexWriterConfig())) { + BytesRef[] refs = new BytesRef[values.length]; + for (int i = 0; i < values.length; i++) { + refs[i] = newBytesRef(values[i]); + } + // StringField stores as STRING — use the matching storedType so stored-value round-trip is + // comparable between the two paths. + batchW.addBatch( + simpleBatch( + totalDocs, + new ArrayBinaryColumn( + "field", storedIndexed, docIds, refs, StoredValue.Type.STRING))); + } + + // --- Doc-by-doc path --- + Directory singleDir = newDirectory(); + try (IndexWriter singleW = new IndexWriter(singleDir, newIndexWriterConfig())) { + int next = 0; + for (int d = 0; d < totalDocs; d++) { + Document doc = new Document(); + if (next < docIds.length && docIds[next] == d) { + doc.add( + new StringField("field", values[next], org.apache.lucene.document.Field.Store.YES)); + next++; + } + singleW.addDocument(doc); + } + } + + try (DirectoryReader batchR = DirectoryReader.open(batchDir); + DirectoryReader singleR = DirectoryReader.open(singleDir)) { + LeafReader batchLeaf = getOnlyLeafReader(batchR); + LeafReader singleLeaf = getOnlyLeafReader(singleR); + + assertEquals(singleLeaf.maxDoc(), batchLeaf.maxDoc()); + assertEquals(totalDocs, batchLeaf.maxDoc()); + + StoredFields batchStored = batchLeaf.storedFields(); + StoredFields singleStored = singleLeaf.storedFields(); + for (int d = 0; d < totalDocs; d++) { + IndexableField bf = batchStored.document(d).getField("field"); + IndexableField sf = singleStored.document(d).getField("field"); + if (sf == null) { + assertNull("doc " + d + " should have no stored field", bf); + } else { + assertNotNull("doc " + d + " should have a stored field", bf); + assertEquals(sf.stringValue(), bf.stringValue()); + } + } + + IndexSearcher batchSearcher = new 
IndexSearcher(batchR); + IndexSearcher singleSearcher = new IndexSearcher(singleR); + for (String v : values) { + Term t = new Term("field", v); + assertEquals(singleSearcher.count(new TermQuery(t)), batchSearcher.count(new TermQuery(t))); + } + } + + batchDir.close(); + singleDir.close(); + } + + /** A row column that returns an out-of-order batch doc-id must be rejected. */ + public void testRowColumnOutOfOrderDocIdThrows() throws IOException { + Directory dir = newDirectory(); + try (IndexWriter w = new IndexWriter(dir, newIndexWriterConfig())) { + FieldType storedOnly = new FieldType(); + storedOnly.setStored(true); + storedOnly.freeze(); + + // docIds intentionally not non-decreasing. + int[] docIds = {2, 1}; + BytesRef[] values = {newBytesRef("a"), newBytesRef("b")}; + expectThrows( + IllegalArgumentException.class, + () -> w.addBatch(simpleBatch(3, new ArrayBinaryColumn("f", storedOnly, docIds, values)))); + } + dir.close(); + } + + /** A row column that returns a batch doc-id {@code >= numDocs} must be rejected. */ + public void testRowColumnOutOfRangeDocIdThrows() throws IOException { + Directory dir = newDirectory(); + try (IndexWriter w = new IndexWriter(dir, newIndexWriterConfig())) { + FieldType storedOnly = new FieldType(); + storedOnly.setStored(true); + storedOnly.freeze(); + + // Batch size 3, but the column advertises a value at doc 5. 
+ int[] docIds = {5}; + BytesRef[] values = {newBytesRef("oob")}; + expectThrows( + IllegalArgumentException.class, + () -> w.addBatch(simpleBatch(3, new ArrayBinaryColumn("f", storedOnly, docIds, values)))); + } + dir.close(); + } + + // --- Test Column implementations backed by arrays --- + + // ---- VectorColumn tests ---- + + public void testDenseFloatVectorColumn() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + FieldType vectorType = floatVectorType(3, VectorSimilarityFunction.EUCLIDEAN); + float[][] vectors = { + {1f, 2f, 3f}, {4f, 5f, 6f}, {7f, 8f, 9f}, + }; + w.addBatch(simpleBatch(3, new ArrayDenseFloatVectorColumn("v", vectorType, vectors))); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + FloatVectorValues values = leaf.getFloatVectorValues("v"); + assertNotNull(values); + KnnVectorValues.DocIndexIterator it = values.iterator(); + for (int i = 0; i < vectors.length; i++) { + assertEquals(i, it.nextDoc()); + assertArrayEquals(vectors[i], values.vectorValue(it.index()), 0f); + } + assertEquals(DocIdSetIterator.NO_MORE_DOCS, it.nextDoc()); + + r.close(); + w.close(); + dir.close(); + } + + public void testDenseByteVectorColumn() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + FieldType vectorType = byteVectorType(4, VectorSimilarityFunction.EUCLIDEAN); + byte[][] vectors = { + {1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}, + }; + w.addBatch(simpleBatch(3, new ArrayDenseByteVectorColumn("v", vectorType, vectors))); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + ByteVectorValues values = leaf.getByteVectorValues("v"); + assertNotNull(values); + KnnVectorValues.DocIndexIterator it = values.iterator(); + for (int i = 0; i < vectors.length; i++) { + assertEquals(i, it.nextDoc()); + assertArrayEquals(vectors[i], 
values.vectorValue(it.index())); + } + assertEquals(DocIdSetIterator.NO_MORE_DOCS, it.nextDoc()); + + r.close(); + w.close(); + dir.close(); + } + + public void testSparseFloatVectorColumn() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + FieldType vectorType = floatVectorType(2, VectorSimilarityFunction.EUCLIDEAN); + int[] docIds = {0, 2, 5, 9}; + float[][] vectors = {{1f, 1f}, {2f, 2f}, {3f, 3f}, {4f, 4f}}; + // pair with a sparse long column so the batch has a defined doc count > vector count + int[] anchorIds = {0, 9}; + long[] anchorVals = {0L, 9L}; + w.addBatch( + simpleBatch( + 10, + new ArrayFloatVectorColumn("v", vectorType, docIds, vectors), + new ArrayLongColumn("anchor", NumericDocValuesField.TYPE, anchorIds, anchorVals))); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + FloatVectorValues values = leaf.getFloatVectorValues("v"); + assertNotNull(values); + KnnVectorValues.DocIndexIterator it = values.iterator(); + for (int i = 0; i < docIds.length; i++) { + assertEquals(docIds[i], it.nextDoc()); + assertArrayEquals(vectors[i], values.vectorValue(it.index()), 0f); + } + assertEquals(DocIdSetIterator.NO_MORE_DOCS, it.nextDoc()); + + r.close(); + w.close(); + dir.close(); + } + + public void testSparseByteVectorColumn() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + FieldType vectorType = byteVectorType(2, VectorSimilarityFunction.EUCLIDEAN); + int[] docIds = {1, 4}; + byte[][] vectors = {{1, 2}, {3, 4}}; + int[] anchorIds = {0, 5}; + long[] anchorVals = {0L, 5L}; + w.addBatch( + simpleBatch( + 6, + new ArrayByteVectorColumn("v", vectorType, docIds, vectors), + new ArrayLongColumn("anchor", NumericDocValuesField.TYPE, anchorIds, anchorVals))); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + ByteVectorValues values = 
leaf.getByteVectorValues("v"); + assertNotNull(values); + KnnVectorValues.DocIndexIterator it = values.iterator(); + for (int i = 0; i < docIds.length; i++) { + assertEquals(docIds[i], it.nextDoc()); + assertArrayEquals(vectors[i], values.vectorValue(it.index())); + } + assertEquals(DocIdSetIterator.NO_MORE_DOCS, it.nextDoc()); + + r.close(); + w.close(); + dir.close(); + } + + public void testVectorMixedWithLongAndBinary() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + FieldType vectorType = floatVectorType(2, VectorSimilarityFunction.DOT_PRODUCT); + float[][] vectors = {{0.6f, 0.8f}, {0.8f, 0.6f}, {1.0f, 0.0f}}; + long[] longs = {10, 20, 30}; + BytesRef[] bins = {newBytesRef("a"), newBytesRef("b"), newBytesRef("c")}; + int[] ids = {0, 1, 2}; + w.addBatch( + simpleBatch( + 3, + new ArrayDenseFloatVectorColumn("v", vectorType, vectors), + new ArrayLongColumn("num", NumericDocValuesField.TYPE, ids, longs), + new ArrayBinaryColumn("bin", BinaryDocValuesField.TYPE, ids, bins))); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + NumericDocValues nums = leaf.getNumericDocValues("num"); + BinaryDocValues binDv = leaf.getBinaryDocValues("bin"); + FloatVectorValues vec = leaf.getFloatVectorValues("v"); + KnnVectorValues.DocIndexIterator it = vec.iterator(); + for (int i = 0; i < 3; i++) { + assertEquals(i, nums.nextDoc()); + assertEquals(longs[i], nums.longValue()); + assertEquals(i, binDv.nextDoc()); + assertEquals(bins[i], binDv.binaryValue()); + assertEquals(i, it.nextDoc()); + assertArrayEquals(vectors[i], vec.vectorValue(it.index()), 0f); + } + r.close(); + w.close(); + dir.close(); + } + + public void testVectorAcrossMultipleBatches() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + FieldType vectorType = floatVectorType(2, VectorSimilarityFunction.EUCLIDEAN); + float[][] 
firstBatch = {{1f, 1f}, {2f, 2f}}; + float[][] secondBatch = {{3f, 3f}, {4f, 4f}, {5f, 5f}}; + w.addBatch(simpleBatch(2, new ArrayDenseFloatVectorColumn("v", vectorType, firstBatch))); + w.addBatch(simpleBatch(3, new ArrayDenseFloatVectorColumn("v", vectorType, secondBatch))); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + FloatVectorValues values = leaf.getFloatVectorValues("v"); + KnnVectorValues.DocIndexIterator it = values.iterator(); + float[][] all = {firstBatch[0], firstBatch[1], secondBatch[0], secondBatch[1], secondBatch[2]}; + for (int i = 0; i < all.length; i++) { + assertEquals(i, it.nextDoc()); + assertArrayEquals(all[i], values.vectorValue(it.index()), 0f); + } + assertEquals(DocIdSetIterator.NO_MORE_DOCS, it.nextDoc()); + + r.close(); + w.close(); + dir.close(); + } + + public void testEmptyVectorColumnRejected() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + // A field type alone is not enough — every batch must have at least one column with data, + // and a vector-only column with no values is the equivalent of "no documents have this + // vector". We pair it with a long anchor to make the batch valid; the vector cursor returns + // NO_MORE_DOCS immediately. 
+ FieldType vectorType = floatVectorType(2, VectorSimilarityFunction.EUCLIDEAN); + int[] anchorIds = {0, 1}; + long[] anchorVals = {0L, 1L}; + w.addBatch( + simpleBatch( + 2, + new ArrayFloatVectorColumn("v", vectorType, new int[0], new float[0][]), + new ArrayLongColumn("anchor", NumericDocValuesField.TYPE, anchorIds, anchorVals))); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + FloatVectorValues values = leaf.getFloatVectorValues("v"); + if (values != null) { + assertEquals(DocIdSetIterator.NO_MORE_DOCS, values.iterator().nextDoc()); + } + + r.close(); + w.close(); + dir.close(); + } + + public void testParentFieldWithVectorBatch() throws IOException { + Directory dir = newDirectory(); + IndexWriterConfig config = newIndexWriterConfig(); + config.setParentField("_parent"); + IndexWriter w = new IndexWriter(dir, config); + + FieldType vectorType = floatVectorType(2, VectorSimilarityFunction.EUCLIDEAN); + float[][] vectors = {{1f, 0f}, {0f, 1f}, {1f, 1f}}; + w.addBatch(simpleBatch(3, new ArrayDenseFloatVectorColumn("v", vectorType, vectors))); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + NumericDocValues parentDv = leaf.getNumericDocValues("_parent"); + assertNotNull(parentDv); + for (int i = 0; i < 3; i++) { + assertEquals(i, parentDv.nextDoc()); + } + FloatVectorValues values = leaf.getFloatVectorValues("v"); + KnnVectorValues.DocIndexIterator it = values.iterator(); + for (int i = 0; i < 3; i++) { + assertEquals(i, it.nextDoc()); + assertArrayEquals(vectors[i], values.vectorValue(it.index()), 0f); + } + + r.close(); + w.close(); + dir.close(); + } + + public void testFloatVectorEncodingMismatchFails() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + // FieldType says FLOAT32 but column carries byte[] vectors. 
+ FieldType vectorType = floatVectorType(2, VectorSimilarityFunction.EUCLIDEAN); + byte[][] vectors = {{1, 2}, {3, 4}}; + expectThrows( + ClassCastException.class, + () -> w.addBatch(simpleBatch(2, new ArrayDenseByteVectorColumn("v", vectorType, vectors)))); + w.rollback(); + dir.close(); + } + + public void testWrongDimensionFails() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + FieldType vectorType = floatVectorType(3, VectorSimilarityFunction.EUCLIDEAN); + float[][] vectors = {{1f, 2f, 3f}, {4f, 5f}}; + IllegalArgumentException e = + expectThrows( + IllegalArgumentException.class, + () -> + w.addBatch( + simpleBatch(2, new ArrayDenseFloatVectorColumn("v", vectorType, vectors)))); + assertTrue(e.getMessage(), e.getMessage().contains("expected dimension 3")); + w.rollback(); + dir.close(); + } + + public void testZeroDimensionFieldTypeFails() { + FieldType bad = new FieldType(); + // No vector attributes set -> vectorDimension() == 0 + bad.setDocValuesType(DocValuesType.NUMERIC); + bad.freeze(); + IllegalArgumentException e = + expectThrows( + IllegalArgumentException.class, + () -> new ArrayDenseFloatVectorColumn("v", bad, new float[][] {{1f, 2f}})); + assertTrue(e.getMessage(), e.getMessage().contains("vectorDimension() > 0")); + } + + public void testVectorWithDocValuesRejected() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + FieldType bad = new FieldType(); + bad.setVectorAttributes(2, VectorEncoding.FLOAT32, VectorSimilarityFunction.EUCLIDEAN); + bad.setDocValuesType(DocValuesType.NUMERIC); + bad.freeze(); + IllegalArgumentException e = + expectThrows( + IllegalArgumentException.class, + () -> + w.addBatch( + simpleBatch( + 1, new ArrayDenseFloatVectorColumn("v", bad, new float[][] {{1f, 2f}})))); + assertTrue(e.getMessage(), e.getMessage().contains("must be vector-only")); + w.rollback(); + dir.close(); + } + 
+ public void testVectorWithStoredRejected() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + FieldType bad = new FieldType(); + bad.setVectorAttributes(2, VectorEncoding.FLOAT32, VectorSimilarityFunction.EUCLIDEAN); + bad.setStored(true); + bad.freeze(); + IllegalArgumentException e = + expectThrows( + IllegalArgumentException.class, + () -> + w.addBatch( + simpleBatch( + 1, new ArrayDenseFloatVectorColumn("v", bad, new float[][] {{1f, 2f}})))); + assertTrue(e.getMessage(), e.getMessage().contains("must be vector-only")); + w.rollback(); + dir.close(); + } + + public void testVectorWithIndexOptionsRejected() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + FieldType bad = new FieldType(); + bad.setVectorAttributes(2, VectorEncoding.FLOAT32, VectorSimilarityFunction.EUCLIDEAN); + bad.setIndexOptions(IndexOptions.DOCS); + bad.setTokenized(false); + bad.freeze(); + IllegalArgumentException e = + expectThrows( + IllegalArgumentException.class, + () -> + w.addBatch( + simpleBatch( + 1, new ArrayDenseFloatVectorColumn("v", bad, new float[][] {{1f, 2f}})))); + assertTrue(e.getMessage(), e.getMessage().contains("must be vector-only")); + w.rollback(); + dir.close(); + } + + public void testVectorWithPointsRejected() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + FieldType bad = new FieldType(); + bad.setVectorAttributes(2, VectorEncoding.FLOAT32, VectorSimilarityFunction.EUCLIDEAN); + bad.setDimensions(1, Integer.BYTES); + bad.freeze(); + IllegalArgumentException e = + expectThrows( + IllegalArgumentException.class, + () -> + w.addBatch( + simpleBatch( + 1, new ArrayDenseFloatVectorColumn("v", bad, new float[][] {{1f, 2f}})))); + assertTrue(e.getMessage(), e.getMessage().contains("must be vector-only")); + w.rollback(); + dir.close(); + } + + public 
void testDuplicateDocIDFails() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + FieldType vectorType = floatVectorType(2, VectorSimilarityFunction.EUCLIDEAN); + int[] docIds = {0, 0}; + float[][] vectors = {{1f, 2f}, {3f, 4f}}; + IllegalArgumentException e = + expectThrows( + IllegalArgumentException.class, + () -> + w.addBatch( + simpleBatch(2, new ArrayFloatVectorColumn("v", vectorType, docIds, vectors)))); + assertTrue(e.getMessage(), e.getMessage().contains("strictly increasing")); + w.rollback(); + dir.close(); + } + + public void testDecreasingDocIDFails() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + FieldType vectorType = floatVectorType(2, VectorSimilarityFunction.EUCLIDEAN); + int[] docIds = {3, 1}; + float[][] vectors = {{1f, 2f}, {3f, 4f}}; + IllegalArgumentException e = + expectThrows( + IllegalArgumentException.class, + () -> + w.addBatch( + simpleBatch(4, new ArrayFloatVectorColumn("v", vectorType, docIds, vectors)))); + assertTrue(e.getMessage(), e.getMessage().contains("strictly increasing")); + w.rollback(); + dir.close(); + } + + public void testVectorOutOfRangeDocIDFails() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + FieldType vectorType = floatVectorType(2, VectorSimilarityFunction.EUCLIDEAN); + int[] docIds = {0, 5}; + float[][] vectors = {{1f, 2f}, {3f, 4f}}; + IllegalArgumentException e = + expectThrows( + IllegalArgumentException.class, + () -> + w.addBatch( + simpleBatch(3, new ArrayFloatVectorColumn("v", vectorType, docIds, vectors)))); + assertTrue(e.getMessage(), e.getMessage().contains("out of range")); + w.rollback(); + dir.close(); + } + + public void testDenseVectorColumnTooFewValuesFails() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); 
+ + FieldType vectorType = floatVectorType(2, VectorSimilarityFunction.EUCLIDEAN); + // 2 values declared DENSE but the batch has 3 docs. + float[][] vectors = {{1f, 2f}, {3f, 4f}}; + IllegalArgumentException e = + expectThrows( + IllegalArgumentException.class, + () -> + w.addBatch( + simpleBatch(3, new ArrayDenseFloatVectorColumn("v", vectorType, vectors)))); + assertTrue(e.getMessage(), e.getMessage().contains("Dense column")); + w.rollback(); + dir.close(); + } + + public void testVectorColumnSchemaConsistencyAcrossBatches() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + FieldType float32Type = floatVectorType(2, VectorSimilarityFunction.EUCLIDEAN); + w.addBatch( + simpleBatch( + 1, new ArrayDenseFloatVectorColumn("v", float32Type, new float[][] {{1f, 2f}}))); + + FieldType byteType = byteVectorType(2, VectorSimilarityFunction.EUCLIDEAN); + expectThrows( + IllegalArgumentException.class, + () -> + w.addBatch( + simpleBatch( + 1, new ArrayDenseByteVectorColumn("v", byteType, new byte[][] {{1, 2}})))); + w.rollback(); + dir.close(); + } + + private static FieldType floatVectorType(int dimension, VectorSimilarityFunction sim) { + FieldType type = new FieldType(); + type.setVectorAttributes(dimension, VectorEncoding.FLOAT32, sim); + type.freeze(); + return type; + } + + private static FieldType byteVectorType(int dimension, VectorSimilarityFunction sim) { + FieldType type = new FieldType(); + type.setVectorAttributes(dimension, VectorEncoding.BYTE, sim); + type.freeze(); + return type; + } + + private static ColumnBatch simpleBatch(int numDocs, Column... columns) { + return new ColumnBatch() { + @Override + public int numDocs() { + return numDocs; + } + + @Override + public Iterable<Column> columns() { + return List.of(columns); + } + }; + } + + private static class ArrayLongColumn extends LongColumn { Review Comment: I had sent an email to the Lucene mailing list about this. 
I had mentioned that at some point Lucene might want to add ergonomic builders (similar to IntField, DoubleField, etc.) vs. just low-level APIs (similar to how users can directly override Field to encode points / doc-values building). I didn't really plan to make them in this PR as I was worried about the surface area and/or quickly adding implementations to APIs which are still in flux. But I can certainly add them if there is specific interest. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
