siddharthteotia commented on a change in pull request #5240:
URL: https://github.com/apache/incubator-pinot/pull/5240#discussion_r419567173



##########
File path: 
pinot-core/src/main/java/org/apache/pinot/core/segment/creator/impl/inv/RangeIndexCreator.java
##########
@@ -0,0 +1,471 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.pinot.core.segment.creator.impl.inv;
+
+import com.google.common.base.Preconditions;
+import it.unimi.dsi.fastutil.Arrays;
+import it.unimi.dsi.fastutil.Swapper;
+import it.unimi.dsi.fastutil.ints.IntComparator;
+import java.io.BufferedOutputStream;
+import java.io.Closeable;
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Random;
+import org.apache.commons.io.FileUtils;
+import org.apache.pinot.core.common.Constants;
+import org.apache.pinot.core.query.utils.Pair;
+import org.apache.pinot.core.segment.creator.InvertedIndexCreator;
+import org.apache.pinot.core.segment.memory.PinotDataBuffer;
+import org.apache.pinot.spi.data.FieldSpec;
+import org.apache.pinot.spi.data.MetricFieldSpec;
+import org.roaringbitmap.buffer.ImmutableRoaringBitmap;
+import org.roaringbitmap.buffer.MutableRoaringBitmap;
+
+import static com.google.common.base.Charsets.UTF_8;
+import static 
org.apache.pinot.core.segment.creator.impl.V1Constants.Indexes.BITMAP_RANGE_INDEX_FILE_EXTENSION;
+
+
+/**
+ * Implementation of {@link InvertedIndexCreator} that uses off-heap memory.
+ * <p>We use 2 passes to create the range index.
+ * <ul>
+ *
+ *   <li>
+ *     A
+ *   </li>
+ *   <li>
+ *     In the first pass (adding values phase), when add() method is called, 
store the raw values into the forward index
+ *     value buffer (for multi-valued column also store number of values for 
each docId into forward index length
+ *     buffer). We also compute the inverted index length for each dictId 
while adding values.
+ *   </li>
+ *   <li>
+ *     In the second pass (processing values phase), when seal() method is 
called, all the dictIds should already been
+ *     added. We first reorder the values into the inverted index buffers by 
going over the dictIds in forward index
+ *     value buffer (for multi-valued column we also need forward index length 
buffer to get the docId for each dictId).
+ *     <p>Once we have the inverted index buffers, we simply go over them and 
create the bitmap for each dictId and
+ *     serialize them into a file.
+ *   </li>
+ * </ul>
+ * <p>Based on the number of values we need to store, we use direct memory or 
MMap file to allocate the buffer.
+ */
+public final class RangeIndexCreator implements InvertedIndexCreator {
+
+  private static final int RANGE_INDEX_VERSION = 1;
+
+  // Use MMapBuffer if the value buffer size is larger than 2G
+  private static final int NUM_VALUES_THRESHOLD_FOR_MMAP_BUFFER = 500_000_000;
+  private static final int DEFAULT_NUM_RANGES = 20;
+
+  private static final String FORWARD_INDEX_VALUE_BUFFER_SUFFIX = 
".fwd.idx.val.buf";
+  private static final String DOC_ID_VALUE_BUFFER_SUFFIX = ".doc.id.val.buf";
+
+  private final File _invertedIndexFile;
+  private final File _forwardIndexValueBufferFile;
+  private final File _docIdBufferFileForSorting;
+  private final int _numValues;
+  private final boolean _useMMapBuffer;
+
+  // Forward index buffers (from docId to dictId)
+  private int _nextDocId;
+  private PinotDataBuffer _forwardIndexValueBuffer;
+  private NumberValueBuffer _valueBuffer;
+  //for sorting
+  private PinotDataBuffer _docIdValueBuffer;
+  private IntValueBuffer _docIdBuffer;
+  // For multi-valued column only because each docId can have multiple dictIds
+  private int _nextValueId;
+  private int _numDocsPerRange;
+  private FieldSpec.DataType _valueType;
+
+  public RangeIndexCreator(File indexDir, FieldSpec fieldSpec, 
FieldSpec.DataType valueType, int numRanges, int numDocsPerRange, int numDocs, 
int numValues)
+      throws IOException {
+    _valueType = valueType;
+    String columnName = fieldSpec.getName();
+    _invertedIndexFile = new File(indexDir, columnName + 
BITMAP_RANGE_INDEX_FILE_EXTENSION);
+    _forwardIndexValueBufferFile = new File(indexDir, columnName + 
FORWARD_INDEX_VALUE_BUFFER_SUFFIX);
+    _docIdBufferFileForSorting = new File(indexDir, columnName + 
DOC_ID_VALUE_BUFFER_SUFFIX);
+    _numValues = fieldSpec.isSingleValueField() ? numDocs : numValues;
+    _useMMapBuffer = _numValues > NUM_VALUES_THRESHOLD_FOR_MMAP_BUFFER;
+    int valueSize = valueType.size();
+    try {
+      //use DEFAULT_NUM_RANGES if numRanges is not set
+      if(numRanges < 0) {
+        numRanges = DEFAULT_NUM_RANGES;
+      }
+      if(numDocsPerRange < 0) {
+        _numDocsPerRange = (int) Math.ceil(_numValues / numRanges);
+      }
+
+      //FORWARD INDEX - stores the actual values added
+      _forwardIndexValueBuffer = createTempBuffer((long) _numValues * 
valueSize, _forwardIndexValueBufferFile);
+
+      switch (_valueType) {
+        case INT:
+          _valueBuffer = new IntValueBuffer(_forwardIndexValueBuffer);
+          break;
+        case FLOAT:
+          _valueBuffer = new FloatValueBuffer(_forwardIndexValueBuffer);
+          break;
+        case LONG:
+          _valueBuffer = new LongValueBuffer(_forwardIndexValueBuffer);
+          break;
+        case DOUBLE:
+          _valueBuffer = new DoubleValueBuffer(_forwardIndexValueBuffer);
+          break;
+        default:
+          throw new UnsupportedOperationException("Range index is not 
supported for columns of data type:" + valueType);
+      }
+
+      //Stores the docId - this will be sorted based on the values in FORWARD 
INDEX in the end
+      _docIdValueBuffer = createTempBuffer((long) _numValues * Integer.BYTES, 
_forwardIndexValueBufferFile);
+      _docIdBuffer = new IntValueBuffer(_docIdValueBuffer);
+    } catch (Exception e) {
+      destroyBuffer(_forwardIndexValueBuffer, _forwardIndexValueBufferFile);
+      destroyBuffer(_forwardIndexValueBuffer, _docIdBufferFileForSorting);
+      throw e;
+    }
+  }
+
+  @Override
+  public void add(int dictId) {
+    _valueBuffer.put(_nextDocId, dictId);
+    _docIdBuffer.put(_nextDocId, _nextDocId);
+    _nextDocId = _nextDocId + 1;
+  }
+
+  @Override
+  public void add(int[] dictIds, int length) {
+    for (int i = 0; i < length; i++) {
+      int dictId = dictIds[i];
+      _valueBuffer.put(_nextDocId, dictId);
+      _docIdBuffer.put(_nextValueId, _nextDocId);
+      _nextValueId = _nextValueId + 1;
+    }
+  }
+
+  @Override
+  public void addDoc(Object document, int docIdCounter) {
+    throw new IllegalStateException("Bitmap inverted index creator does not 
support Object type currently");
+  }
+
+  @Override
+  public void seal()
+      throws IOException {
+    //sort the forward index copy
+    //go over the sorted value to compute ranges
+    IntComparator comparator = (i, j) -> {

Review comment:
       So this is either going to sort raw values or dictIds. Correct?

##########
File path: 
pinot-core/src/main/java/org/apache/pinot/core/segment/creator/impl/inv/RangeIndexCreator.java
##########
@@ -0,0 +1,471 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.pinot.core.segment.creator.impl.inv;
+
+import com.google.common.base.Preconditions;
+import it.unimi.dsi.fastutil.Arrays;
+import it.unimi.dsi.fastutil.Swapper;
+import it.unimi.dsi.fastutil.ints.IntComparator;
+import java.io.BufferedOutputStream;
+import java.io.Closeable;
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Random;
+import org.apache.commons.io.FileUtils;
+import org.apache.pinot.core.common.Constants;
+import org.apache.pinot.core.query.utils.Pair;
+import org.apache.pinot.core.segment.creator.InvertedIndexCreator;
+import org.apache.pinot.core.segment.memory.PinotDataBuffer;
+import org.apache.pinot.spi.data.FieldSpec;
+import org.apache.pinot.spi.data.MetricFieldSpec;
+import org.roaringbitmap.buffer.ImmutableRoaringBitmap;
+import org.roaringbitmap.buffer.MutableRoaringBitmap;
+
+import static com.google.common.base.Charsets.UTF_8;
+import static 
org.apache.pinot.core.segment.creator.impl.V1Constants.Indexes.BITMAP_RANGE_INDEX_FILE_EXTENSION;
+
+
+/**
+ * Implementation of {@link InvertedIndexCreator} that uses off-heap memory.
+ * <p>We use 2 passes to create the range index.
+ * <ul>
+ *
+ *   <li>
+ *     A
+ *   </li>
+ *   <li>
+ *     In the first pass (adding values phase), when add() method is called, 
store the raw values into the forward index
+ *     value buffer (for multi-valued column also store number of values for 
each docId into forward index length
+ *     buffer). We also compute the inverted index length for each dictId 
while adding values.
+ *   </li>
+ *   <li>
+ *     In the second pass (processing values phase), when seal() method is 
called, all the dictIds should already been
+ *     added. We first reorder the values into the inverted index buffers by 
going over the dictIds in forward index
+ *     value buffer (for multi-valued column we also need forward index length 
buffer to get the docId for each dictId).
+ *     <p>Once we have the inverted index buffers, we simply go over them and 
create the bitmap for each dictId and
+ *     serialize them into a file.
+ *   </li>
+ * </ul>
+ * <p>Based on the number of values we need to store, we use direct memory or 
MMap file to allocate the buffer.
+ */
+public final class RangeIndexCreator implements InvertedIndexCreator {
+
+  private static final int RANGE_INDEX_VERSION = 1;
+
+  // Use MMapBuffer if the value buffer size is larger than 2G
+  private static final int NUM_VALUES_THRESHOLD_FOR_MMAP_BUFFER = 500_000_000;
+  private static final int DEFAULT_NUM_RANGES = 20;
+
+  private static final String FORWARD_INDEX_VALUE_BUFFER_SUFFIX = 
".fwd.idx.val.buf";
+  private static final String DOC_ID_VALUE_BUFFER_SUFFIX = ".doc.id.val.buf";
+
+  private final File _invertedIndexFile;
+  private final File _forwardIndexValueBufferFile;
+  private final File _docIdBufferFileForSorting;
+  private final int _numValues;
+  private final boolean _useMMapBuffer;
+
+  // Forward index buffers (from docId to dictId)
+  private int _nextDocId;
+  private PinotDataBuffer _forwardIndexValueBuffer;
+  private NumberValueBuffer _valueBuffer;
+  //for sorting
+  private PinotDataBuffer _docIdValueBuffer;
+  private IntValueBuffer _docIdBuffer;
+  // For multi-valued column only because each docId can have multiple dictIds
+  private int _nextValueId;
+  private int _numDocsPerRange;
+  private FieldSpec.DataType _valueType;
+
+  public RangeIndexCreator(File indexDir, FieldSpec fieldSpec, 
FieldSpec.DataType valueType, int numRanges, int numDocsPerRange, int numDocs, 
int numValues)
+      throws IOException {
+    _valueType = valueType;
+    String columnName = fieldSpec.getName();
+    _invertedIndexFile = new File(indexDir, columnName + 
BITMAP_RANGE_INDEX_FILE_EXTENSION);
+    _forwardIndexValueBufferFile = new File(indexDir, columnName + 
FORWARD_INDEX_VALUE_BUFFER_SUFFIX);
+    _docIdBufferFileForSorting = new File(indexDir, columnName + 
DOC_ID_VALUE_BUFFER_SUFFIX);
+    _numValues = fieldSpec.isSingleValueField() ? numDocs : numValues;
+    _useMMapBuffer = _numValues > NUM_VALUES_THRESHOLD_FOR_MMAP_BUFFER;
+    int valueSize = valueType.size();
+    try {
+      //use DEFAULT_NUM_RANGES if numRanges is not set
+      if(numRanges < 0) {
+        numRanges = DEFAULT_NUM_RANGES;
+      }
+      if(numDocsPerRange < 0) {
+        _numDocsPerRange = (int) Math.ceil(_numValues / numRanges);
+      }
+
+      //FORWARD INDEX - stores the actual values added
+      _forwardIndexValueBuffer = createTempBuffer((long) _numValues * 
valueSize, _forwardIndexValueBufferFile);
+
+      switch (_valueType) {
+        case INT:
+          _valueBuffer = new IntValueBuffer(_forwardIndexValueBuffer);
+          break;
+        case FLOAT:
+          _valueBuffer = new FloatValueBuffer(_forwardIndexValueBuffer);
+          break;
+        case LONG:
+          _valueBuffer = new LongValueBuffer(_forwardIndexValueBuffer);
+          break;
+        case DOUBLE:
+          _valueBuffer = new DoubleValueBuffer(_forwardIndexValueBuffer);
+          break;
+        default:
+          throw new UnsupportedOperationException("Range index is not 
supported for columns of data type:" + valueType);
+      }
+
+      //Stores the docId - this will be sorted based on the values in FORWARD 
INDEX in the end
+      _docIdValueBuffer = createTempBuffer((long) _numValues * Integer.BYTES, 
_forwardIndexValueBufferFile);
+      _docIdBuffer = new IntValueBuffer(_docIdValueBuffer);
+    } catch (Exception e) {
+      destroyBuffer(_forwardIndexValueBuffer, _forwardIndexValueBufferFile);
+      destroyBuffer(_forwardIndexValueBuffer, _docIdBufferFileForSorting);
+      throw e;
+    }
+  }
+
+  @Override
+  public void add(int dictId) {
+    _valueBuffer.put(_nextDocId, dictId);
+    _docIdBuffer.put(_nextDocId, _nextDocId);
+    _nextDocId = _nextDocId + 1;
+  }
+
+  @Override
+  public void add(int[] dictIds, int length) {
+    for (int i = 0; i < length; i++) {
+      int dictId = dictIds[i];
+      _valueBuffer.put(_nextDocId, dictId);
+      _docIdBuffer.put(_nextValueId, _nextDocId);
+      _nextValueId = _nextValueId + 1;
+    }
+  }
+
+  @Override
+  public void addDoc(Object document, int docIdCounter) {
+    throw new IllegalStateException("Bitmap inverted index creator does not 
support Object type currently");

Review comment:
       Change the exception
   
   

##########
File path: 
pinot-core/src/main/java/org/apache/pinot/core/segment/creator/impl/inv/RangeIndexCreator.java
##########
@@ -0,0 +1,471 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.pinot.core.segment.creator.impl.inv;
+
+import com.google.common.base.Preconditions;
+import it.unimi.dsi.fastutil.Arrays;
+import it.unimi.dsi.fastutil.Swapper;
+import it.unimi.dsi.fastutil.ints.IntComparator;
+import java.io.BufferedOutputStream;
+import java.io.Closeable;
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Random;
+import org.apache.commons.io.FileUtils;
+import org.apache.pinot.core.common.Constants;
+import org.apache.pinot.core.query.utils.Pair;
+import org.apache.pinot.core.segment.creator.InvertedIndexCreator;
+import org.apache.pinot.core.segment.memory.PinotDataBuffer;
+import org.apache.pinot.spi.data.FieldSpec;
+import org.apache.pinot.spi.data.MetricFieldSpec;
+import org.roaringbitmap.buffer.ImmutableRoaringBitmap;
+import org.roaringbitmap.buffer.MutableRoaringBitmap;
+
+import static com.google.common.base.Charsets.UTF_8;
+import static 
org.apache.pinot.core.segment.creator.impl.V1Constants.Indexes.BITMAP_RANGE_INDEX_FILE_EXTENSION;
+
+
+/**
+ * Implementation of {@link InvertedIndexCreator} that uses off-heap memory.
+ * <p>We use 2 passes to create the range index.
+ * <ul>
+ *
+ *   <li>
+ *     A
+ *   </li>
+ *   <li>
+ *     In the first pass (adding values phase), when add() method is called, 
store the raw values into the forward index
+ *     value buffer (for multi-valued column also store number of values for 
each docId into forward index length
+ *     buffer). We also compute the inverted index length for each dictId 
while adding values.
+ *   </li>
+ *   <li>
+ *     In the second pass (processing values phase), when seal() method is 
called, all the dictIds should already been
+ *     added. We first reorder the values into the inverted index buffers by 
going over the dictIds in forward index
+ *     value buffer (for multi-valued column we also need forward index length 
buffer to get the docId for each dictId).
+ *     <p>Once we have the inverted index buffers, we simply go over them and 
create the bitmap for each dictId and
+ *     serialize them into a file.
+ *   </li>
+ * </ul>
+ * <p>Based on the number of values we need to store, we use direct memory or 
MMap file to allocate the buffer.
+ */
+public final class RangeIndexCreator implements InvertedIndexCreator {
+
+  private static final int RANGE_INDEX_VERSION = 1;
+
+  // Use MMapBuffer if the value buffer size is larger than 2G
+  private static final int NUM_VALUES_THRESHOLD_FOR_MMAP_BUFFER = 500_000_000;
+  private static final int DEFAULT_NUM_RANGES = 20;
+
+  private static final String FORWARD_INDEX_VALUE_BUFFER_SUFFIX = 
".fwd.idx.val.buf";
+  private static final String DOC_ID_VALUE_BUFFER_SUFFIX = ".doc.id.val.buf";
+
+  private final File _invertedIndexFile;
+  private final File _forwardIndexValueBufferFile;
+  private final File _docIdBufferFileForSorting;
+  private final int _numValues;
+  private final boolean _useMMapBuffer;
+
+  // Forward index buffers (from docId to dictId)
+  private int _nextDocId;
+  private PinotDataBuffer _forwardIndexValueBuffer;
+  private NumberValueBuffer _valueBuffer;

Review comment:
       This buffer will either store dictIds or raw values. Correct?

##########
File path: 
pinot-core/src/main/java/org/apache/pinot/core/segment/creator/impl/inv/RangeIndexCreator.java
##########
@@ -0,0 +1,471 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.pinot.core.segment.creator.impl.inv;
+
+import com.google.common.base.Preconditions;
+import it.unimi.dsi.fastutil.Arrays;
+import it.unimi.dsi.fastutil.Swapper;
+import it.unimi.dsi.fastutil.ints.IntComparator;
+import java.io.BufferedOutputStream;
+import java.io.Closeable;
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Random;
+import org.apache.commons.io.FileUtils;
+import org.apache.pinot.core.common.Constants;
+import org.apache.pinot.core.query.utils.Pair;
+import org.apache.pinot.core.segment.creator.InvertedIndexCreator;
+import org.apache.pinot.core.segment.memory.PinotDataBuffer;
+import org.apache.pinot.spi.data.FieldSpec;
+import org.apache.pinot.spi.data.MetricFieldSpec;
+import org.roaringbitmap.buffer.ImmutableRoaringBitmap;
+import org.roaringbitmap.buffer.MutableRoaringBitmap;
+
+import static com.google.common.base.Charsets.UTF_8;
+import static 
org.apache.pinot.core.segment.creator.impl.V1Constants.Indexes.BITMAP_RANGE_INDEX_FILE_EXTENSION;
+
+
+/**
+ * Implementation of {@link InvertedIndexCreator} that uses off-heap memory.
+ * <p>We use 2 passes to create the range index.
+ * <ul>
+ *
+ *   <li>
+ *     A
+ *   </li>
+ *   <li>
+ *     In the first pass (adding values phase), when add() method is called, 
store the raw values into the forward index
+ *     value buffer (for multi-valued column also store number of values for 
each docId into forward index length
+ *     buffer). We also compute the inverted index length for each dictId 
while adding values.
+ *   </li>
+ *   <li>
+ *     In the second pass (processing values phase), when seal() method is 
called, all the dictIds should already been
+ *     added. We first reorder the values into the inverted index buffers by 
going over the dictIds in forward index
+ *     value buffer (for multi-valued column we also need forward index length 
buffer to get the docId for each dictId).
+ *     <p>Once we have the inverted index buffers, we simply go over them and 
create the bitmap for each dictId and
+ *     serialize them into a file.
+ *   </li>
+ * </ul>
+ * <p>Based on the number of values we need to store, we use direct memory or 
MMap file to allocate the buffer.
+ */
+public final class RangeIndexCreator implements InvertedIndexCreator {
+
+  private static final int RANGE_INDEX_VERSION = 1;
+
+  // Use MMapBuffer if the value buffer size is larger than 2G
+  private static final int NUM_VALUES_THRESHOLD_FOR_MMAP_BUFFER = 500_000_000;
+  private static final int DEFAULT_NUM_RANGES = 20;
+
+  private static final String FORWARD_INDEX_VALUE_BUFFER_SUFFIX = 
".fwd.idx.val.buf";
+  private static final String DOC_ID_VALUE_BUFFER_SUFFIX = ".doc.id.val.buf";
+
+  private final File _invertedIndexFile;
+  private final File _forwardIndexValueBufferFile;
+  private final File _docIdBufferFileForSorting;
+  private final int _numValues;
+  private final boolean _useMMapBuffer;
+
+  // Forward index buffers (from docId to dictId)
+  private int _nextDocId;
+  private PinotDataBuffer _forwardIndexValueBuffer;

Review comment:
       Can we add some short comments (one line is enough) about each buffer 
type?

##########
File path: 
pinot-core/src/main/java/org/apache/pinot/core/segment/creator/impl/inv/RangeIndexCreator.java
##########
@@ -0,0 +1,471 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.pinot.core.segment.creator.impl.inv;
+
+import com.google.common.base.Preconditions;
+import it.unimi.dsi.fastutil.Arrays;
+import it.unimi.dsi.fastutil.Swapper;
+import it.unimi.dsi.fastutil.ints.IntComparator;
+import java.io.BufferedOutputStream;
+import java.io.Closeable;
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Random;
+import org.apache.commons.io.FileUtils;
+import org.apache.pinot.core.common.Constants;
+import org.apache.pinot.core.query.utils.Pair;
+import org.apache.pinot.core.segment.creator.InvertedIndexCreator;
+import org.apache.pinot.core.segment.memory.PinotDataBuffer;
+import org.apache.pinot.spi.data.FieldSpec;
+import org.apache.pinot.spi.data.MetricFieldSpec;
+import org.roaringbitmap.buffer.ImmutableRoaringBitmap;
+import org.roaringbitmap.buffer.MutableRoaringBitmap;
+
+import static com.google.common.base.Charsets.UTF_8;
+import static 
org.apache.pinot.core.segment.creator.impl.V1Constants.Indexes.BITMAP_RANGE_INDEX_FILE_EXTENSION;
+
+
+/**
+ * Implementation of {@link InvertedIndexCreator} that uses off-heap memory.
+ * <p>We use 2 passes to create the range index.
+ * <ul>
+ *
+ *   <li>
+ *     A
+ *   </li>
+ *   <li>
+ *     In the first pass (adding values phase), when add() method is called, 
store the raw values into the forward index
+ *     value buffer (for multi-valued column also store number of values for 
each docId into forward index length
+ *     buffer). We also compute the inverted index length for each dictId 
while adding values.
+ *   </li>
+ *   <li>
+ *     In the second pass (processing values phase), when seal() method is 
called, all the dictIds should already been
+ *     added. We first reorder the values into the inverted index buffers by 
going over the dictIds in forward index
+ *     value buffer (for multi-valued column we also need forward index length 
buffer to get the docId for each dictId).
+ *     <p>Once we have the inverted index buffers, we simply go over them and 
create the bitmap for each dictId and
+ *     serialize them into a file.
+ *   </li>
+ * </ul>
+ * <p>Based on the number of values we need to store, we use direct memory or 
MMap file to allocate the buffer.
+ */
+public final class RangeIndexCreator implements InvertedIndexCreator {
+
+  private static final int RANGE_INDEX_VERSION = 1;
+
+  // Use MMapBuffer if the value buffer size is larger than 2G
+  private static final int NUM_VALUES_THRESHOLD_FOR_MMAP_BUFFER = 500_000_000;
+  private static final int DEFAULT_NUM_RANGES = 20;
+
+  private static final String FORWARD_INDEX_VALUE_BUFFER_SUFFIX = 
".fwd.idx.val.buf";
+  private static final String DOC_ID_VALUE_BUFFER_SUFFIX = ".doc.id.val.buf";
+
+  private final File _invertedIndexFile;
+  private final File _forwardIndexValueBufferFile;
+  private final File _docIdBufferFileForSorting;
+  private final int _numValues;
+  private final boolean _useMMapBuffer;
+
+  // Forward index buffers (from docId to dictId)

Review comment:
       Why are we creating the forward index for the column as part of creating 
range index? The loop in SegmentColumnarIndexCreator that goes over each 
GenericRow takes care of that.

##########
File path: 
pinot-core/src/main/java/org/apache/pinot/core/segment/creator/impl/inv/RangeIndexCreator.java
##########
@@ -0,0 +1,471 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.pinot.core.segment.creator.impl.inv;
+
+import com.google.common.base.Preconditions;
+import it.unimi.dsi.fastutil.Arrays;
+import it.unimi.dsi.fastutil.Swapper;
+import it.unimi.dsi.fastutil.ints.IntComparator;
+import java.io.BufferedOutputStream;
+import java.io.Closeable;
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Random;
+import org.apache.commons.io.FileUtils;
+import org.apache.pinot.core.common.Constants;
+import org.apache.pinot.core.query.utils.Pair;
+import org.apache.pinot.core.segment.creator.InvertedIndexCreator;
+import org.apache.pinot.core.segment.memory.PinotDataBuffer;
+import org.apache.pinot.spi.data.FieldSpec;
+import org.apache.pinot.spi.data.MetricFieldSpec;
+import org.roaringbitmap.buffer.ImmutableRoaringBitmap;
+import org.roaringbitmap.buffer.MutableRoaringBitmap;
+
+import static com.google.common.base.Charsets.UTF_8;
+import static 
org.apache.pinot.core.segment.creator.impl.V1Constants.Indexes.BITMAP_RANGE_INDEX_FILE_EXTENSION;
+
+
+/**
+ * Implementation of {@link InvertedIndexCreator} that uses off-heap memory.
+ * <p>We use 2 passes to create the range index.
+ * <ul>
+ *
+ *   <li>
+ *     A
+ *   </li>
+ *   <li>
+ *     In the first pass (adding values phase), when add() method is called, 
store the raw values into the forward index
+ *     value buffer (for multi-valued column also store number of values for 
each docId into forward index length
+ *     buffer). We also compute the inverted index length for each dictId 
while adding values.
+ *   </li>
+ *   <li>
+ *     In the second pass (processing values phase), when seal() method is 
called, all the dictIds should already been
+ *     added. We first reorder the values into the inverted index buffers by 
going over the dictIds in forward index
+ *     value buffer (for multi-valued column we also need forward index length 
buffer to get the docId for each dictId).
+ *     <p>Once we have the inverted index buffers, we simply go over them and 
create the bitmap for each dictId and
+ *     serialize them into a file.
+ *   </li>
+ * </ul>
+ * <p>Based on the number of values we need to store, we use direct memory or 
MMap file to allocate the buffer.
+ */
+public final class RangeIndexCreator implements InvertedIndexCreator {

Review comment:
       I think the problem with treating this as another inverted index by 
overriding InvertedIndexCreator and InvertedIndexReader interfaces, is that we 
are inherently making this work only on dictionary encoded columns because 
these interfaces and their APIs are written to work only on dictIds
   
   However, for text index (which also implements the same interface), we added 
a docId based API but that takes an object value. May be we can expand that API 
for each primitive type and that way we can support raw value based inverted 
index as well. I think we should decide on this sooner because this question 
will pop up for every new index type that we may add.
   
   Another alternative would be to change the inheritance hierarchy by 
introducing a mid layer of interfaces -- DictionaryBasedInvertedIndexCreator, 
RawValueBasedInvertedIndexCreator. Move all the dictId based APIs to former and 
raw value based to latter

##########
File path: 
pinot-core/src/main/java/org/apache/pinot/core/segment/creator/impl/inv/RangeIndexCreator.java
##########
@@ -0,0 +1,471 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.pinot.core.segment.creator.impl.inv;
+
+import com.google.common.base.Preconditions;
+import it.unimi.dsi.fastutil.Arrays;
+import it.unimi.dsi.fastutil.Swapper;
+import it.unimi.dsi.fastutil.ints.IntComparator;
+import java.io.BufferedOutputStream;
+import java.io.Closeable;
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Random;
+import org.apache.commons.io.FileUtils;
+import org.apache.pinot.core.common.Constants;
+import org.apache.pinot.core.query.utils.Pair;
+import org.apache.pinot.core.segment.creator.InvertedIndexCreator;
+import org.apache.pinot.core.segment.memory.PinotDataBuffer;
+import org.apache.pinot.spi.data.FieldSpec;
+import org.apache.pinot.spi.data.MetricFieldSpec;
+import org.roaringbitmap.buffer.ImmutableRoaringBitmap;
+import org.roaringbitmap.buffer.MutableRoaringBitmap;
+
+import static com.google.common.base.Charsets.UTF_8;
+import static 
org.apache.pinot.core.segment.creator.impl.V1Constants.Indexes.BITMAP_RANGE_INDEX_FILE_EXTENSION;
+
+
+/**
+ * Implementation of {@link InvertedIndexCreator} that uses off-heap memory.
+ * <p>We use 2 passes to create the range index.
+ * <ul>
+ *
+ *   <li>
+ *     A
+ *   </li>
+ *   <li>
+ *     In the first pass (adding values phase), when add() method is called, 
store the raw values into the forward index
+ *     value buffer (for multi-valued column also store number of values for 
each docId into forward index length
+ *     buffer). We also compute the inverted index length for each dictId 
while adding values.
+ *   </li>
+ *   <li>
+ *     In the second pass (processing values phase), when seal() method is 
called, all the dictIds should already been
+ *     added. We first reorder the values into the inverted index buffers by 
going over the dictIds in forward index
+ *     value buffer (for multi-valued column we also need forward index length 
buffer to get the docId for each dictId).
+ *     <p>Once we have the inverted index buffers, we simply go over them and 
create the bitmap for each dictId and

Review comment:
       This doesn't sound right. We don't create a bitmap for each dictId. I 
believe there is a bitmap for a contiguous range of dictIds

##########
File path: 
pinot-core/src/main/java/org/apache/pinot/core/segment/creator/impl/inv/RangeIndexCreator.java
##########
@@ -0,0 +1,471 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.pinot.core.segment.creator.impl.inv;
+
+import com.google.common.base.Preconditions;
+import it.unimi.dsi.fastutil.Arrays;
+import it.unimi.dsi.fastutil.Swapper;
+import it.unimi.dsi.fastutil.ints.IntComparator;
+import java.io.BufferedOutputStream;
+import java.io.Closeable;
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Random;
+import org.apache.commons.io.FileUtils;
+import org.apache.pinot.core.common.Constants;
+import org.apache.pinot.core.query.utils.Pair;
+import org.apache.pinot.core.segment.creator.InvertedIndexCreator;
+import org.apache.pinot.core.segment.memory.PinotDataBuffer;
+import org.apache.pinot.spi.data.FieldSpec;
+import org.apache.pinot.spi.data.MetricFieldSpec;
+import org.roaringbitmap.buffer.ImmutableRoaringBitmap;
+import org.roaringbitmap.buffer.MutableRoaringBitmap;
+
+import static com.google.common.base.Charsets.UTF_8;
+import static 
org.apache.pinot.core.segment.creator.impl.V1Constants.Indexes.BITMAP_RANGE_INDEX_FILE_EXTENSION;
+
+
+/**
+ * Implementation of {@link InvertedIndexCreator} that uses off-heap memory.
+ * <p>We use 2 passes to create the range index.
+ * <ul>
+ *
+ *   <li>
+ *     A
+ *   </li>
+ *   <li>
+ *     In the first pass (adding values phase), when add() method is called, 
store the raw values into the forward index

Review comment:
       You mean store the dictionary Ids?

##########
File path: 
pinot-core/src/main/java/org/apache/pinot/core/operator/filter/predicate/PredicateEvaluator.java
##########
@@ -22,6 +22,8 @@
 
 
 public interface PredicateEvaluator {
+

Review comment:
       remove?

##########
File path: 
pinot-core/src/main/java/org/apache/pinot/core/operator/filter/RangeFilterOperator.java
##########
@@ -0,0 +1,114 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.pinot.core.operator.filter;
+
+import com.google.common.base.Preconditions;
+import java.util.ArrayList;
+import java.util.List;
+import org.apache.pinot.core.common.BlockDocIdIterator;
+import org.apache.pinot.core.common.DataSource;
+import org.apache.pinot.core.common.Predicate;
+import org.apache.pinot.core.operator.blocks.FilterBlock;
+import org.apache.pinot.core.operator.dociditerators.MVScanDocIdIterator;
+import org.apache.pinot.core.operator.dociditerators.SVScanDocIdIterator;
+import org.apache.pinot.core.operator.docidsets.BitmapDocIdSet;
+import org.apache.pinot.core.operator.docidsets.FilterBlockDocIdSet;
+import org.apache.pinot.core.operator.filter.predicate.PredicateEvaluator;
+import 
org.apache.pinot.core.operator.filter.predicate.RangePredicateEvaluatorFactory;
+import org.apache.pinot.core.segment.index.readers.RangeIndexReader;
+import org.roaringbitmap.buffer.ImmutableRoaringBitmap;
+import org.roaringbitmap.buffer.MutableRoaringBitmap;
+
+
+public class RangeFilterOperator extends BaseFilterOperator {
+  private static final String OPERATOR_NAME = "RangeFilterOperator";
+
+  private final PredicateEvaluator _predicateEvaluator;
+  private final DataSource _dataSource;
+  private final int _startDocId;
+  private final int _endDocId;
+  private final boolean _exclusive;
+
+  public RangeFilterOperator(PredicateEvaluator predicateEvaluator, DataSource 
dataSource, int startDocId,
+      int endDocId) {
+    // NOTE:
+    // Predicate that is always evaluated as true or false should not be 
passed into the BitmapBasedFilterOperator for
+    // performance concern.
+    // If predicate is always evaluated as true, use MatchAllFilterOperator; 
if predicate is always evaluated as false,
+    // use EmptyFilterOperator.
+    Preconditions.checkArgument(!predicateEvaluator.isAlwaysTrue() && 
!predicateEvaluator.isAlwaysFalse());
+
+    _predicateEvaluator = predicateEvaluator;
+    _dataSource = dataSource;
+    _startDocId = startDocId;
+    _endDocId = endDocId;
+    _exclusive = predicateEvaluator.isExclusive();
+  }
+
+  @Override
+  protected FilterBlock getNextBlock() {
+
+    //only dictionary based is supported for now

Review comment:
       Let's add a TODO for supporting this on raw columns

##########
File path: 
pinot-core/src/main/java/org/apache/pinot/core/operator/filter/RangeFilterOperator.java
##########
@@ -0,0 +1,114 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.pinot.core.operator.filter;
+
+import com.google.common.base.Preconditions;
+import java.util.ArrayList;
+import java.util.List;
+import org.apache.pinot.core.common.BlockDocIdIterator;
+import org.apache.pinot.core.common.DataSource;
+import org.apache.pinot.core.common.Predicate;
+import org.apache.pinot.core.operator.blocks.FilterBlock;
+import org.apache.pinot.core.operator.dociditerators.MVScanDocIdIterator;
+import org.apache.pinot.core.operator.dociditerators.SVScanDocIdIterator;
+import org.apache.pinot.core.operator.docidsets.BitmapDocIdSet;
+import org.apache.pinot.core.operator.docidsets.FilterBlockDocIdSet;
+import org.apache.pinot.core.operator.filter.predicate.PredicateEvaluator;
+import 
org.apache.pinot.core.operator.filter.predicate.RangePredicateEvaluatorFactory;
+import org.apache.pinot.core.segment.index.readers.RangeIndexReader;
+import org.roaringbitmap.buffer.ImmutableRoaringBitmap;
+import org.roaringbitmap.buffer.MutableRoaringBitmap;
+
+
+public class RangeFilterOperator extends BaseFilterOperator {

Review comment:
       please add javadoc




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]



---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to