This is an automated email from the ASF dual-hosted git repository.
siddteotia pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-pinot.git
The following commit(s) were added to refs/heads/master by this push:
new 410fd70 Use 8byte offsets in chunk based raw index creator (#5285)
410fd70 is described below
commit 410fd708508cff3ea33ca2567a296fa62b5bfecd
Author: Sidd <[email protected]>
AuthorDate: Thu Apr 23 09:49:35 2020 -0700
Use 8byte offsets in chunk based raw index creator (#5285)
* Use 8byte offsets in chunk based raw index
creator
* cleanup
* fixed tests
* Fix tests and address review comments
* Use 8-byte offset for fixed-byte chunk writer.
Add backward compatibility test
Co-authored-by: Siddharth Teotia <[email protected]>
---
.../reader/impl/v1/BaseChunkSingleValueReader.java | 27 +++++---
.../impl/v1/VarByteChunkSingleValueReader.java | 6 +-
.../writer/impl/v1/BaseChunkSingleValueWriter.java | 30 +++++++--
.../impl/v1/FixedByteChunkSingleValueWriter.java | 5 +-
.../impl/v1/VarByteChunkSingleValueWriter.java | 8 ++-
.../segment/memory/PinotNativeOrderLBuffer.java | 2 +-
.../segment/memory/PinotNonNativeOrderLBuffer.java | 2 +-
.../FixedByteChunkSingleValueReaderWriteTest.java | 25 +++++---
.../VarByteChunkSingleValueReaderWriteTest.java | 68 ++++++++++++++++-----
.../src/test/resources/data/fixedByteCompressed.v2 | Bin 0 -> 8098 bytes
pinot-core/src/test/resources/data/fixedByteRaw.v2 | Bin 0 -> 16036 bytes
.../resources/data/varByteStringsCompressed.v2 | Bin 0 -> 17355 bytes
.../src/test/resources/data/varByteStringsRaw.v2 | Bin 0 -> 286902 bytes
13 files changed, 127 insertions(+), 46 deletions(-)
diff --git
a/pinot-core/src/main/java/org/apache/pinot/core/io/reader/impl/v1/BaseChunkSingleValueReader.java
b/pinot-core/src/main/java/org/apache/pinot/core/io/reader/impl/v1/BaseChunkSingleValueReader.java
index 6cd86c1..24972ad 100644
---
a/pinot-core/src/main/java/org/apache/pinot/core/io/reader/impl/v1/BaseChunkSingleValueReader.java
+++
b/pinot-core/src/main/java/org/apache/pinot/core/io/reader/impl/v1/BaseChunkSingleValueReader.java
@@ -18,12 +18,15 @@
*/
package org.apache.pinot.core.io.reader.impl.v1;
+import com.google.common.base.Preconditions;
import java.io.IOException;
import java.nio.ByteBuffer;
import org.apache.pinot.core.io.compression.ChunkCompressorFactory;
import org.apache.pinot.core.io.compression.ChunkDecompressor;
import org.apache.pinot.core.io.reader.BaseSingleColumnSingleValueReader;
import org.apache.pinot.core.io.reader.impl.ChunkReaderContext;
+import org.apache.pinot.core.io.writer.impl.v1.BaseChunkSingleValueWriter;
+import org.apache.pinot.core.io.writer.impl.v1.VarByteChunkSingleValueWriter;
import org.apache.pinot.core.segment.memory.PinotDataBuffer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -47,6 +50,8 @@ public abstract class BaseChunkSingleValueReader extends
BaseSingleColumnSingleV
protected final int _numDocsPerChunk;
protected final int _numChunks;
protected final int _lengthOfLongestEntry;
+ private final int _version;
+ private final int _headerEntryChunkOffsetSize;
/**
* Constructor for the class.
@@ -57,7 +62,7 @@ public abstract class BaseChunkSingleValueReader extends
BaseSingleColumnSingleV
_dataBuffer = pinotDataBuffer;
int headerOffset = 0;
- int version = _dataBuffer.getInt(headerOffset);
+ _version = _dataBuffer.getInt(headerOffset);
headerOffset += Integer.BYTES;
_numChunks = _dataBuffer.getInt(headerOffset);
@@ -70,7 +75,7 @@ public abstract class BaseChunkSingleValueReader extends
BaseSingleColumnSingleV
headerOffset += Integer.BYTES;
int dataHeaderStart = headerOffset;
- if (version > 1) {
+ if (_version > 1) {
_dataBuffer.getInt(headerOffset); // Total docs
headerOffset += Integer.BYTES;
@@ -87,9 +92,10 @@ public abstract class BaseChunkSingleValueReader extends
BaseSingleColumnSingleV
}
_chunkSize = (_lengthOfLongestEntry * _numDocsPerChunk);
+ _headerEntryChunkOffsetSize =
BaseChunkSingleValueWriter.getHeaderEntryChunkOffsetSize(_version);
// Slice out the header from the data buffer.
- int dataHeaderLength = _numChunks * Integer.BYTES;
+ int dataHeaderLength = _numChunks * _headerEntryChunkOffsetSize;
int rawDataStart = dataHeaderStart + dataHeaderLength;
_dataHeader = _dataBuffer.view(dataHeaderStart, rawDataStart);
@@ -120,14 +126,14 @@ public abstract class BaseChunkSingleValueReader extends
BaseSingleColumnSingleV
}
int chunkSize;
- int chunkPosition = getChunkPosition(chunkId);
+ long chunkPosition = getChunkPosition(chunkId);
// Size of chunk can be determined using next chunks offset, or end of
data buffer for last chunk.
if (chunkId == (_numChunks - 1)) { // Last chunk.
chunkSize = (int) (_dataBuffer.size() - chunkPosition);
} else {
- int nextChunkOffset = getChunkPosition(chunkId + 1);
- chunkSize = nextChunkOffset - chunkPosition;
+ long nextChunkOffset = getChunkPosition(chunkId + 1);
+ chunkSize = (int)(nextChunkOffset - chunkPosition);
}
ByteBuffer decompressedBuffer = context.getChunkBuffer();
@@ -145,12 +151,15 @@ public abstract class BaseChunkSingleValueReader extends
BaseSingleColumnSingleV
/**
* Helper method to get the offset of the chunk in the data.
- *
* @param chunkId Id of the chunk for which to return the position.
* @return Position (offset) of the chunk in the data.
*/
- protected int getChunkPosition(int chunkId) {
- return _dataHeader.getInt(chunkId * Integer.BYTES);
+ protected long getChunkPosition(int chunkId) {
+ if (_headerEntryChunkOffsetSize == Integer.BYTES) {
+ return _dataHeader.getInt(chunkId * _headerEntryChunkOffsetSize);
+ } else {
+ return _dataHeader.getLong(chunkId * _headerEntryChunkOffsetSize);
+ }
}
/**
diff --git
a/pinot-core/src/main/java/org/apache/pinot/core/io/reader/impl/v1/VarByteChunkSingleValueReader.java
b/pinot-core/src/main/java/org/apache/pinot/core/io/reader/impl/v1/VarByteChunkSingleValueReader.java
index 82fcb2b..418a0f3 100644
---
a/pinot-core/src/main/java/org/apache/pinot/core/io/reader/impl/v1/VarByteChunkSingleValueReader.java
+++
b/pinot-core/src/main/java/org/apache/pinot/core/io/reader/impl/v1/VarByteChunkSingleValueReader.java
@@ -55,7 +55,7 @@ public class VarByteChunkSingleValueReader extends
BaseChunkSingleValueReader {
int chunkRowId = row % _numDocsPerChunk;
ByteBuffer chunkBuffer = getChunkForRow(row, context);
- int rowOffset = chunkBuffer.getInt(chunkRowId * Integer.BYTES);
+ int rowOffset = chunkBuffer.getInt(chunkRowId *
VarByteChunkSingleValueWriter.CHUNK_HEADER_ENTRY_ROW_OFFSET_SIZE);
int nextRowOffset = getNextRowOffset(chunkRowId, chunkBuffer);
int length = nextRowOffset - rowOffset;
@@ -77,7 +77,7 @@ public class VarByteChunkSingleValueReader extends
BaseChunkSingleValueReader {
int chunkRowId = row % _numDocsPerChunk;
ByteBuffer chunkBuffer = getChunkForRow(row, context);
- int rowOffset = chunkBuffer.getInt(chunkRowId * Integer.BYTES);
+ int rowOffset = chunkBuffer.getInt(chunkRowId *
VarByteChunkSingleValueWriter.CHUNK_HEADER_ENTRY_ROW_OFFSET_SIZE);
int nextRowOffset = getNextRowOffset(chunkRowId, chunkBuffer);
int length = nextRowOffset - rowOffset;
@@ -109,7 +109,7 @@ public class VarByteChunkSingleValueReader extends
BaseChunkSingleValueReader {
// Last row in this chunk.
nextRowOffset = chunkBuffer.limit();
} else {
- nextRowOffset = chunkBuffer.getInt((currentRowId + 1) * Integer.BYTES);
+ nextRowOffset = chunkBuffer.getInt((currentRowId + 1) *
VarByteChunkSingleValueWriter.CHUNK_HEADER_ENTRY_ROW_OFFSET_SIZE);
// For incomplete chunks, the next string's offset will be 0 as row
offset for absent rows are 0.
if (nextRowOffset == 0) {
nextRowOffset = chunkBuffer.limit();
diff --git
a/pinot-core/src/main/java/org/apache/pinot/core/io/writer/impl/v1/BaseChunkSingleValueWriter.java
b/pinot-core/src/main/java/org/apache/pinot/core/io/writer/impl/v1/BaseChunkSingleValueWriter.java
index 46399b7..597bd1f 100644
---
a/pinot-core/src/main/java/org/apache/pinot/core/io/writer/impl/v1/BaseChunkSingleValueWriter.java
+++
b/pinot-core/src/main/java/org/apache/pinot/core/io/writer/impl/v1/BaseChunkSingleValueWriter.java
@@ -18,6 +18,7 @@
*/
package org.apache.pinot.core.io.writer.impl.v1;
+import com.google.common.base.Preconditions;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
@@ -37,6 +38,8 @@ import org.slf4j.LoggerFactory;
*/
public abstract class BaseChunkSingleValueWriter implements
SingleColumnSingleValueWriter {
private static final Logger LOGGER =
LoggerFactory.getLogger(BaseChunkSingleValueWriter.class);
+ private static final int FILE_HEADER_ENTRY_CHUNK_OFFSET_SIZE_V1V2 =
Integer.BYTES;
+ private static final int FILE_HEADER_ENTRY_CHUNK_OFFSET_SIZE_V3 = Long.BYTES;
protected final FileChannel _dataFile;
protected ByteBuffer _header;
@@ -45,7 +48,9 @@ public abstract class BaseChunkSingleValueWriter implements
SingleColumnSingleVa
protected final ChunkCompressor _chunkCompressor;
protected int _chunkSize;
- protected int _dataOffset;
+ protected long _dataOffset;
+
+ private final int _headerEntryChunkOffsetSize;
/**
* Constructor for the class.
@@ -64,13 +69,25 @@ public abstract class BaseChunkSingleValueWriter implements
SingleColumnSingleVa
throws FileNotFoundException {
_chunkSize = chunkSize;
_chunkCompressor = ChunkCompressorFactory.getCompressor(compressionType);
-
+ _headerEntryChunkOffsetSize = getHeaderEntryChunkOffsetSize(version);
_dataOffset = writeHeader(compressionType, totalDocs, numDocsPerChunk,
sizeOfEntry, version);
_chunkBuffer = ByteBuffer.allocateDirect(chunkSize);
_compressedBuffer = ByteBuffer.allocateDirect(chunkSize * 2);
_dataFile = new RandomAccessFile(file, "rw").getChannel();
}
+ public static int getHeaderEntryChunkOffsetSize(int version) {
+ switch (version) {
+ case 1:
+ case 2:
+ return FILE_HEADER_ENTRY_CHUNK_OFFSET_SIZE_V1V2;
+ case 3:
+ return FILE_HEADER_ENTRY_CHUNK_OFFSET_SIZE_V3;
+ default:
+ throw new IllegalStateException("Invalid version: " + version);
+ }
+ }
+
@Override
public void setChar(int row, char ch) {
throw new UnsupportedOperationException();
@@ -139,7 +156,7 @@ public abstract class BaseChunkSingleValueWriter implements
SingleColumnSingleVa
private int writeHeader(ChunkCompressorFactory.CompressionType
compressionType, int totalDocs, int numDocsPerChunk,
int sizeOfEntry, int version) {
int numChunks = (totalDocs + numDocsPerChunk - 1) / numDocsPerChunk;
- int headerSize = (numChunks + 7) * Integer.BYTES; // 7 items written
before chunk indexing.
+ int headerSize = (7 * Integer.BYTES) + (numChunks *
_headerEntryChunkOffsetSize);
_header = ByteBuffer.allocateDirect(headerSize);
@@ -196,7 +213,12 @@ public abstract class BaseChunkSingleValueWriter
implements SingleColumnSingleVa
throw new RuntimeException(e);
}
- _header.putInt(_dataOffset);
+ if (_headerEntryChunkOffsetSize == Integer.BYTES) {
+ _header.putInt((int)_dataOffset);
+ } else if (_headerEntryChunkOffsetSize == Long.BYTES) {
+ _header.putLong(_dataOffset);
+ }
+
_dataOffset += sizeToWrite;
_chunkBuffer.clear();
diff --git
a/pinot-core/src/main/java/org/apache/pinot/core/io/writer/impl/v1/FixedByteChunkSingleValueWriter.java
b/pinot-core/src/main/java/org/apache/pinot/core/io/writer/impl/v1/FixedByteChunkSingleValueWriter.java
index 4894591..2457e88 100644
---
a/pinot-core/src/main/java/org/apache/pinot/core/io/writer/impl/v1/FixedByteChunkSingleValueWriter.java
+++
b/pinot-core/src/main/java/org/apache/pinot/core/io/writer/impl/v1/FixedByteChunkSingleValueWriter.java
@@ -40,7 +40,8 @@ import
org.apache.pinot.core.io.compression.ChunkCompressorFactory;
* <li> Integer: Total number of docs (version 2 onwards). </li>
* <li> Integer: Compression type enum value (version 2 onwards). </li>
* <li> Integer: Start offset of data header (version 2 onwards). </li>
- * <li> Integer array: Integer offsets for all chunks in the data .</li>
+ * <li> Integer array: Integer offsets for all chunks in the data (up to
version 2),
+ * Long array: Long offsets for all chunks in the data (version 3 onwards)
</li>
* </ul>
*
* <p> Individual Chunks: </p>
@@ -53,7 +54,7 @@ import
org.apache.pinot.core.io.compression.ChunkCompressorFactory;
@NotThreadSafe
public class FixedByteChunkSingleValueWriter extends
BaseChunkSingleValueWriter {
- private static final int CURRENT_VERSION = 2;
+ private static final int CURRENT_VERSION = 3;
private int _chunkDataOffset;
/**
diff --git
a/pinot-core/src/main/java/org/apache/pinot/core/io/writer/impl/v1/VarByteChunkSingleValueWriter.java
b/pinot-core/src/main/java/org/apache/pinot/core/io/writer/impl/v1/VarByteChunkSingleValueWriter.java
index 950c003..8a2561b 100644
---
a/pinot-core/src/main/java/org/apache/pinot/core/io/writer/impl/v1/VarByteChunkSingleValueWriter.java
+++
b/pinot-core/src/main/java/org/apache/pinot/core/io/writer/impl/v1/VarByteChunkSingleValueWriter.java
@@ -36,7 +36,11 @@ import
org.apache.pinot.core.io.compression.ChunkCompressorFactory;
* <li> Integer: Total number of chunks. </li>
* <li> Integer: Number of docs per chunk. </li>
* <li> Integer: Length of longest entry (in bytes). </li>
- * <li> Integer array: Integer offsets for all chunks in the data .</li>
+ * <li> Integer: Total number of docs (version 2 onwards). </li>
+ * <li> Integer: Compression type enum value (version 2 onwards). </li>
+ * <li> Integer: Start offset of data header (version 2 onwards). </li>
+ * <li> Integer array: Integer offsets for all chunks in the data (up to
version 2),
+ * Long array: Long offsets for all chunks in the data (version 3 onwards)
</li>
* </ul>
*
* <p> Individual Chunks: </p>
@@ -49,7 +53,7 @@ import
org.apache.pinot.core.io.compression.ChunkCompressorFactory;
*/
@NotThreadSafe
public class VarByteChunkSingleValueWriter extends BaseChunkSingleValueWriter {
- private static final int CURRENT_VERSION = 2;
+ private static final int CURRENT_VERSION = 3;
public static final int CHUNK_HEADER_ENTRY_ROW_OFFSET_SIZE = Integer.BYTES;
private final int _chunkHeaderSize;
diff --git
a/pinot-core/src/main/java/org/apache/pinot/core/segment/memory/PinotNativeOrderLBuffer.java
b/pinot-core/src/main/java/org/apache/pinot/core/segment/memory/PinotNativeOrderLBuffer.java
index cf7af29..8f8c97c 100644
---
a/pinot-core/src/main/java/org/apache/pinot/core/segment/memory/PinotNativeOrderLBuffer.java
+++
b/pinot-core/src/main/java/org/apache/pinot/core/segment/memory/PinotNativeOrderLBuffer.java
@@ -43,7 +43,7 @@ public class PinotNativeOrderLBuffer extends BasePinotLBuffer
{
return buffer;
}
- static PinotNativeOrderLBuffer mapFile(File file, boolean readOnly, long
offset, long size)
+ public static PinotNativeOrderLBuffer mapFile(File file, boolean readOnly,
long offset, long size)
throws IOException {
if (readOnly) {
return new PinotNativeOrderLBuffer(new MMapBuffer(file, offset, size,
MMapMode.READ_ONLY), true, false);
diff --git
a/pinot-core/src/main/java/org/apache/pinot/core/segment/memory/PinotNonNativeOrderLBuffer.java
b/pinot-core/src/main/java/org/apache/pinot/core/segment/memory/PinotNonNativeOrderLBuffer.java
index e14c784..b597686 100644
---
a/pinot-core/src/main/java/org/apache/pinot/core/segment/memory/PinotNonNativeOrderLBuffer.java
+++
b/pinot-core/src/main/java/org/apache/pinot/core/segment/memory/PinotNonNativeOrderLBuffer.java
@@ -43,7 +43,7 @@ public class PinotNonNativeOrderLBuffer extends
BasePinotLBuffer {
return buffer;
}
- static PinotNonNativeOrderLBuffer mapFile(File file, boolean readOnly, long
offset, long size)
+ public static PinotNonNativeOrderLBuffer mapFile(File file, boolean
readOnly, long offset, long size)
throws IOException {
if (readOnly) {
return new PinotNonNativeOrderLBuffer(new MMapBuffer(file, offset, size,
MMapMode.READ_ONLY), true, false);
diff --git
a/pinot-core/src/test/java/org/apache/pinot/index/readerwriter/FixedByteChunkSingleValueReaderWriteTest.java
b/pinot-core/src/test/java/org/apache/pinot/index/readerwriter/FixedByteChunkSingleValueReaderWriteTest.java
index 92d1f3a..b670486 100644
---
a/pinot-core/src/test/java/org/apache/pinot/index/readerwriter/FixedByteChunkSingleValueReaderWriteTest.java
+++
b/pinot-core/src/test/java/org/apache/pinot/index/readerwriter/FixedByteChunkSingleValueReaderWriteTest.java
@@ -260,25 +260,32 @@ public class FixedByteChunkSingleValueReaderWriteTest {
* @throws IOException
*/
@Test
- public void testBackwardCompatibility()
- throws IOException {
- // Get v1 from resources folder
+ public void testBackwardCompatibilityV1()
+ throws Exception {
+ testBackwardCompatibilityHelper("data/fixedByteSVRDoubles.v1", 10009, 0);
+ }
+
+ @Test
+ public void testBackwardCompatibilityV2()
+ throws Exception {
+ testBackwardCompatibilityHelper("data/fixedByteCompressed.v2", 2000,
100.2356);
+ testBackwardCompatibilityHelper("data/fixedByteRaw.v2", 2000, 100.2356);
+ }
+
+ private void testBackwardCompatibilityHelper(String fileName, int numDocs,
double startValue)
+ throws Exception {
ClassLoader classLoader = getClass().getClassLoader();
- String fileName = "data/fixedByteSVRDoubles.v1";
URL resource = classLoader.getResource(fileName);
if (resource == null) {
throw new RuntimeException("Input file not found: " + fileName);
}
-
File file = new File(resource.getFile());
try (FixedByteChunkSingleValueReader reader = new
FixedByteChunkSingleValueReader(
PinotDataBuffer.mapReadOnlyBigEndianFile(file))) {
ChunkReaderContext context = reader.createContext();
-
- int numEntries = 10009; // Number of entries in the input file.
- for (int i = 0; i < numEntries; i++) {
+ for (int i = 0; i < numDocs; i++) {
double actual = reader.getDouble(i, context);
- Assert.assertEquals(actual, (double) i);
+ Assert.assertEquals(actual, i + startValue);
}
}
}
diff --git
a/pinot-core/src/test/java/org/apache/pinot/index/readerwriter/VarByteChunkSingleValueReaderWriteTest.java
b/pinot-core/src/test/java/org/apache/pinot/index/readerwriter/VarByteChunkSingleValueReaderWriteTest.java
index 659bbb2..334a935 100644
---
a/pinot-core/src/test/java/org/apache/pinot/index/readerwriter/VarByteChunkSingleValueReaderWriteTest.java
+++
b/pinot-core/src/test/java/org/apache/pinot/index/readerwriter/VarByteChunkSingleValueReaderWriteTest.java
@@ -21,16 +21,20 @@ package org.apache.pinot.index.readerwriter;
import java.io.File;
import java.io.IOException;
import java.net.URL;
+import java.nio.ByteOrder;
import java.nio.charset.Charset;
import java.util.Random;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang.RandomStringUtils;
+import org.apache.pinot.common.utils.StringUtil;
import org.apache.pinot.core.io.compression.ChunkCompressorFactory;
import org.apache.pinot.core.io.reader.impl.ChunkReaderContext;
import org.apache.pinot.core.io.reader.impl.v1.VarByteChunkSingleValueReader;
import org.apache.pinot.core.io.writer.impl.v1.VarByteChunkSingleValueWriter;
import
org.apache.pinot.core.segment.creator.impl.fwd.SingleValueVarByteRawIndexCreator;
import org.apache.pinot.core.segment.memory.PinotDataBuffer;
+import org.apache.pinot.core.segment.memory.PinotNativeOrderLBuffer;
+import org.apache.pinot.core.segment.memory.PinotNonNativeOrderLBuffer;
import org.testng.Assert;
import org.testng.annotations.Test;
@@ -113,27 +117,37 @@ public class VarByteChunkSingleValueReaderWriteTest {
* @throws IOException
*/
@Test
- public void testBackwardCompatibility()
- throws IOException {
+ public void testBackwardCompatibilityV1()
+ throws Exception {
String[] expected = new String[]{"abcde", "fgh", "ijklmn", "12345"};
+ testBackwardCompatibilityHelper("data/varByteStrings.v1", expected, 1009);
+ }
- // Get v1 from resources folder
+ /**
+ * This test ensures that the reader can read in a data file from version 2.
+ */
+ @Test
+ public void testBackwardCompatibilityV2()
+ throws Exception {
+ String[] data = {"abcdefghijk", "12456887", "pqrstuv", "500"};
+ testBackwardCompatibilityHelper("data/varByteStringsCompressed.v2", data,
1000);
+ testBackwardCompatibilityHelper("data/varByteStringsRaw.v2", data, 1000);
+ }
+
+ private void testBackwardCompatibilityHelper(String fileName, String[] data,
int numDocs)
+ throws Exception {
ClassLoader classLoader = getClass().getClassLoader();
- String fileName = "data/varByteStrings.v1";
URL resource = classLoader.getResource(fileName);
if (resource == null) {
throw new RuntimeException("Input file not found: " + fileName);
}
-
File file = new File(resource.getFile());
try (VarByteChunkSingleValueReader reader = new
VarByteChunkSingleValueReader(
PinotDataBuffer.mapReadOnlyBigEndianFile(file))) {
ChunkReaderContext context = reader.createContext();
-
- int numEntries = 1009; // Number of entries in the input file.
- for (int i = 0; i < numEntries; i++) {
+ for (int i = 0; i < numDocs; i++) {
String actual = reader.getString(i, context);
- Assert.assertEquals(actual, expected[i % expected.length]);
+ Assert.assertEquals(actual, data[i % data.length]);
}
}
}
@@ -173,7 +187,7 @@ public class VarByteChunkSingleValueReaderWriteTest {
int maxStringLengthInBytes = 0;
for (int i = 0; i < numDocs; i++) {
expected[i] = RandomStringUtils.random(random.nextInt(numChars));
- maxStringLengthInBytes = Math.max(maxStringLengthInBytes,
expected[i].getBytes(UTF_8).length);
+ maxStringLengthInBytes = Math.max(maxStringLengthInBytes,
StringUtil.encodeUtf8(expected[i]).length);
}
int numDocsPerChunk =
SingleValueVarByteRawIndexCreator.getNumDocsPerChunk(maxStringLengthInBytes);
@@ -183,20 +197,44 @@ public class VarByteChunkSingleValueReaderWriteTest {
for (int i = 0; i < numDocs; i += 2) {
writer.setString(i, expected[i]);
- writer.setBytes(i + 1, expected[i].getBytes(UTF_8));
+ writer.setBytes(i + 1, StringUtil.encodeUtf8(expected[i]));
}
writer.close();
- try (VarByteChunkSingleValueReader reader = new
VarByteChunkSingleValueReader(
- PinotDataBuffer.mapReadOnlyBigEndianFile(outFile))) {
+ PinotDataBuffer buffer = PinotDataBuffer.mapReadOnlyBigEndianFile(outFile);
+ try (VarByteChunkSingleValueReader reader = new
VarByteChunkSingleValueReader(buffer)) {
ChunkReaderContext context = reader.createContext();
+ for (int i = 0; i < numDocs; i += 2) {
+ String actual = reader.getString(i, context);
+ Assert.assertEquals(actual, expected[i]);
+ byte[] expectedBytes = StringUtil.encodeUtf8(expected[i]);
+ Assert.assertEquals(StringUtil.encodeUtf8(actual), expectedBytes);
+ Assert.assertEquals(reader.getBytes(i + 1, context), expectedBytes);
+ }
+ }
+
+ // For large variable width column values (where total size of data
+ // across all rows in the segment is > 2GB), LBuffer will be used for
+ // reading the fwd index. However, to test this scenario the unit test
+ // will take a long time to execute due to comparison
+ // (75000 characters in each row and 10000 rows will hit this scenario).
+ // So we specifically test for mapping the index file into a LBuffer
+ // to exercise the LBuffer code
+ if (ByteOrder.nativeOrder() == ByteOrder.BIG_ENDIAN) {
+ buffer = PinotNativeOrderLBuffer.mapFile(outFile, true, 0,
outFile.length());
+ } else {
+ buffer = PinotNonNativeOrderLBuffer.mapFile(outFile, true, 0,
outFile.length());
+ }
+ try (VarByteChunkSingleValueReader reader = new
VarByteChunkSingleValueReader(buffer)) {
+ ChunkReaderContext context = reader.createContext();
for (int i = 0; i < numDocs; i += 2) {
String actual = reader.getString(i, context);
Assert.assertEquals(actual, expected[i]);
- Assert.assertEquals(actual.getBytes(UTF_8),
expected[i].getBytes(UTF_8));
- Assert.assertEquals(reader.getBytes(i + 1),
expected[i].getBytes(UTF_8));
+ byte[] expectedBytes = StringUtil.encodeUtf8(expected[i]);
+ Assert.assertEquals(StringUtil.encodeUtf8(actual), expectedBytes);
+ Assert.assertEquals(reader.getBytes(i + 1, context), expectedBytes);
}
}
diff --git a/pinot-core/src/test/resources/data/fixedByteCompressed.v2
b/pinot-core/src/test/resources/data/fixedByteCompressed.v2
new file mode 100644
index 0000000..e2ee3a8
Binary files /dev/null and
b/pinot-core/src/test/resources/data/fixedByteCompressed.v2 differ
diff --git a/pinot-core/src/test/resources/data/fixedByteRaw.v2
b/pinot-core/src/test/resources/data/fixedByteRaw.v2
new file mode 100644
index 0000000..a9d6d34
Binary files /dev/null and b/pinot-core/src/test/resources/data/fixedByteRaw.v2
differ
diff --git a/pinot-core/src/test/resources/data/varByteStringsCompressed.v2
b/pinot-core/src/test/resources/data/varByteStringsCompressed.v2
new file mode 100644
index 0000000..ef3da91
Binary files /dev/null and
b/pinot-core/src/test/resources/data/varByteStringsCompressed.v2 differ
diff --git a/pinot-core/src/test/resources/data/varByteStringsRaw.v2
b/pinot-core/src/test/resources/data/varByteStringsRaw.v2
new file mode 100644
index 0000000..0f838dc
Binary files /dev/null and
b/pinot-core/src/test/resources/data/varByteStringsRaw.v2 differ
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]