easyice commented on PR #12842: URL: https://github.com/apache/lucene/pull/12842#issuecomment-1976298566
Hi Adrien, for simplicity, I changed the logic to use group-varint only when there are just positions (no offsets and no payloads). In addition, since `writeGroupVInt` is slower than `writeVInt`, we don't see an improvement for `indexWriter.flush`; it may even be a bit slower. I wrote a new ugly JMH benchmark for `addPositions/nextPositions`: ``` PR benchmark_nextPositions thrpt 5 0.448 ± 0.042 ops/us main benchmark_nextPositions thrpt 5 0.646 ± 0.045 ops/us ``` <details> <summary >Code</summary> ```java @BenchmarkMode(Mode.Throughput) @OutputTimeUnit(TimeUnit.MICROSECONDS) @State(Scope.Benchmark) @Warmup(iterations = 3, time = 3) @Measurement(iterations = 5, time = 5) @Fork( value = 1, jvmArgsPrepend = {"--add-modules=jdk.unsupported"}) public class PosGroupVIntBenchmark { Directory dir; private Random rand = new Random(0); PostingsEnum reuse; TermsEnum termsEnum; // copy from TestUtil private String randomSimpleString(Random r, int minLength, int maxLength) { final int end = r.nextInt(minLength, maxLength); if (end == 0) { // allow 0 length return ""; } final char[] buffer = new char[end]; for (int i = 0; i < end; i++) { buffer[i] = (char) r.nextInt('a', 'z'); } return new String(buffer, 0, end); } private String randomString(int termsPerField, int freqPerTerm) { List<String> values = new ArrayList<>(); for (int i = 0; i < termsPerField; ) { String s = randomSimpleString(rand, 5, 10); for (int j = 0; j < freqPerTerm; j++) { values.add(s); } i += freqPerTerm; } Collections.shuffle(values, rand); String text = String.join(" ", values); return text; } private List<String> randomStrings(int size, int termsPerField, int freqPerTerm) { List<String> values = new ArrayList<>(); for (int i = 0; i < size; i++) { values.add(randomString(termsPerField, freqPerTerm)); } return values; } @Setup(Level.Trial) public void init() throws Exception { dir = new ByteBuffersDirectory(); List<String> terms = randomStrings(10, 200, 100); IndexWriterConfig config = new IndexWriterConfig(new 
StandardAnalyzer()); Sort indexSort = new Sort(new SortField("sort", SortField.Type.LONG)); config.setMaxBufferedDocs(IndexWriterConfig.DISABLE_AUTO_FLUSH); IndexWriter w = new IndexWriter(dir, config); for (int i = 0; i < terms.size(); ++i) { Document doc = new Document(); doc.add(new NumericDocValuesField("sort", rand.nextInt())); doc.add(new TextField("field", terms.get(i), Field.Store.NO)); w.addDocument(doc); } w.commit(); IndexReader r = DirectoryReader.open(w); CodecReader cr = (CodecReader) r.leaves().get(0).reader(); CodecReader wrap = SortingCodecReader.wrap(cr, indexSort); Terms fieldTerms = wrap.getPostingsReader().terms("field"); termsEnum = fieldTerms.iterator(); termsEnum.next(); reuse = termsEnum.postings(null, PostingsEnum.POSITIONS); w.close(); } @Benchmark public void benchmark_addPositions() throws Exception { termsEnum.postings(reuse, PostingsEnum.POSITIONS); } @Benchmark public void benchmark_nextPositions() throws Exception { termsEnum.postings(reuse, PostingsEnum.POSITIONS); reuse.nextDoc(); int freq = reuse.freq(); for (int i = 0; i < freq; i++) { reuse.nextPosition(); } } } ``` </details> -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org