This is an automated email from the ASF dual-hosted git repository.

jackie pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-pinot.git


The following commit(s) were added to refs/heads/master by this push:
     new 6fcccfc  Add genericRow file reader and writer (#6997)
6fcccfc is described below

commit 6fcccfc39c0179b9ff856d5ebc11098dbaaaccf1
Author: Xiaotian (Jackie) Jiang <[email protected]>
AuthorDate: Thu May 27 23:10:15 2021 -0700

    Add genericRow file reader and writer (#6997)
    
    Add file reader and writer for genericRow leveraging the ser/de classes
    Replace the usage of ser/de to the file reader/writer
---
 .../processing/collector/ConcatCollector.java      | 102 +++++++--------------
 .../GenericRowDeserializer.java                    |   2 +-
 .../genericrow/GenericRowFileReader.java           |  81 ++++++++++++++++
 .../genericrow/GenericRowFileWriter.java           |  72 +++++++++++++++
 .../GenericRowSerializer.java                      |   2 +-
 .../{serde => genericrow}/GenericRowSerDeTest.java |   2 +-
 6 files changed, 190 insertions(+), 71 deletions(-)

diff --git 
a/pinot-core/src/main/java/org/apache/pinot/core/segment/processing/collector/ConcatCollector.java
 
b/pinot-core/src/main/java/org/apache/pinot/core/segment/processing/collector/ConcatCollector.java
index d760274..d7cc173 100644
--- 
a/pinot-core/src/main/java/org/apache/pinot/core/segment/processing/collector/ConcatCollector.java
+++ 
b/pinot-core/src/main/java/org/apache/pinot/core/segment/processing/collector/ConcatCollector.java
@@ -20,10 +20,7 @@ package org.apache.pinot.core.segment.processing.collector;
 
 import com.google.common.base.Preconditions;
 import it.unimi.dsi.fastutil.Arrays;
-import java.io.BufferedOutputStream;
 import java.io.File;
-import java.io.FileNotFoundException;
-import java.io.FileOutputStream;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Iterator;
@@ -31,9 +28,8 @@ import java.util.List;
 import javax.annotation.Nullable;
 import org.apache.commons.collections.CollectionUtils;
 import org.apache.commons.io.FileUtils;
-import org.apache.pinot.core.segment.processing.serde.GenericRowDeserializer;
-import org.apache.pinot.core.segment.processing.serde.GenericRowSerializer;
-import org.apache.pinot.segment.spi.memory.PinotDataBuffer;
+import 
org.apache.pinot.core.segment.processing.genericrow.GenericRowFileReader;
+import 
org.apache.pinot.core.segment.processing.genericrow.GenericRowFileWriter;
 import org.apache.pinot.spi.data.FieldSpec;
 import org.apache.pinot.spi.data.FieldSpec.DataType;
 import org.apache.pinot.spi.data.Schema;
@@ -44,26 +40,20 @@ import org.apache.pinot.spi.data.readers.GenericRow;
  * A Collector implementation for collecting and concatenating all incoming 
rows.
  */
 public class ConcatCollector implements Collector {
-  private static final String RECORDS_FILE_NAME = "collector.records";
+  private static final String RECORD_OFFSET_FILE_NAME = "record.offset";
+  private static final String RECORD_DATA_FILE_NAME = "record.data";
 
   private final List<FieldSpec> _fieldSpecs = new ArrayList<>();
-  private final GenericRowSerializer _genericRowSerializer;
   private final int _numSortColumns;
   private final SortOrderComparator _sortOrderComparator;
   private final File _workingDir;
-  private final File _collectorRecordFile;
+  private final File _recordOffsetFile;
+  private final File _recordDataFile;
 
+  private GenericRowFileWriter _recordFileWriter;
+  private GenericRowFileReader _recordFileReader;
   private int _numDocs;
 
-  // TODO: Avoid using BufferedOutputStream, and use ByteBuffer directly.
-  //  However, ByteBuffer has a limitation that the size cannot exceed 2G.
-  //  There are no limits on the size of data inserted into the {@link 
Collector}.
-  //  Hence, would need to implement a hybrid approach or a trigger a flush 
when size exceeds on Collector.
-  private BufferedOutputStream _collectorRecordOutputStream;
-  private List<Long> _collectorRecordOffsets;
-  private PinotDataBuffer _collectorRecordBuffer;
-  private GenericRowDeserializer _genericRowDeserializer;
-
   public ConcatCollector(CollectorConfig collectorConfig, Schema schema) {
     List<String> sortOrder = collectorConfig.getSortOrder();
     if (CollectionUtils.isNotEmpty(sortOrder)) {
@@ -92,48 +82,33 @@ public class ConcatCollector implements Collector {
         }
       }
     }
-    // TODO: Pass 'includeNullFields' from the config
-    _genericRowSerializer = new GenericRowSerializer(_fieldSpecs, true);
 
     _workingDir =
         new File(FileUtils.getTempDirectory(), 
String.format("concat_collector_%d", System.currentTimeMillis()));
     Preconditions.checkState(_workingDir.mkdirs(), "Failed to create dir: %s 
for %s with config: %s",
         _workingDir.getAbsolutePath(), ConcatCollector.class.getSimpleName(), 
collectorConfig);
-    _collectorRecordFile = new File(_workingDir, RECORDS_FILE_NAME);
-
-    initializeBuffer();
-  }
+    _recordOffsetFile = new File(_workingDir, RECORD_OFFSET_FILE_NAME);
+    _recordDataFile = new File(_workingDir, RECORD_DATA_FILE_NAME);
 
-  private void initializeBuffer() {
-    Preconditions.checkState(!_collectorRecordFile.exists(),
-        "Collector record file: " + _collectorRecordFile + " already exists");
     try {
-      _collectorRecordOutputStream = new BufferedOutputStream(new 
FileOutputStream(_collectorRecordFile));
-    } catch (FileNotFoundException e) {
-      throw new RuntimeException(e);
+      reset();
+    } catch (IOException e) {
+      throw new RuntimeException("Caught exception while resetting the 
collector", e);
     }
-    _collectorRecordOffsets = new ArrayList<>();
-    _collectorRecordOffsets.add(0L);
-    _numDocs = 0;
   }
 
   @Override
   public void collect(GenericRow genericRow)
       throws IOException {
-    byte[] genericRowBytes = _genericRowSerializer.serialize(genericRow);
-    _collectorRecordOutputStream.write(genericRowBytes);
-    _collectorRecordOffsets.add(_collectorRecordOffsets.get(_numDocs) + 
genericRowBytes.length);
+    _recordFileWriter.write(genericRow);
     _numDocs++;
   }
 
   @Override
   public Iterator<GenericRow> iterator()
       throws IOException {
-    _collectorRecordOutputStream.flush();
-    _collectorRecordBuffer = PinotDataBuffer
-        .mapFile(_collectorRecordFile, true, 0, 
_collectorRecordOffsets.get(_numDocs), PinotDataBuffer.NATIVE_ORDER,
-            "ConcatCollector: generic row buffer");
-    _genericRowDeserializer = new 
GenericRowDeserializer(_collectorRecordBuffer, _fieldSpecs, true);
+    _recordFileWriter.close();
+    _recordFileReader = new GenericRowFileReader(_recordOffsetFile, 
_recordDataFile, _fieldSpecs, true);
 
     // TODO: A lot of this code can be made common across Collectors, once 
{@link RollupCollector} is also converted to off heap implementation
     if (_numSortColumns != 0) {
@@ -142,12 +117,9 @@ public class ConcatCollector implements Collector {
         sortedDocIds[i] = i;
       }
 
-      Arrays.quickSort(0, _numDocs, (i1, i2) -> {
-        long startOffset1 = _collectorRecordOffsets.get(sortedDocIds[i1]);
-        long startOffset2 = _collectorRecordOffsets.get(sortedDocIds[i2]);
-        return 
_sortOrderComparator.compare(_genericRowDeserializer.partialDeserialize(startOffset1,
 _numSortColumns),
-            _genericRowDeserializer.partialDeserialize(startOffset2, 
_numSortColumns));
-      }, (i1, i2) -> {
+      Arrays.quickSort(0, _numDocs, (i1, i2) -> _sortOrderComparator
+          .compare(_recordFileReader.partialRead(sortedDocIds[i1], 
_numSortColumns),
+              _recordFileReader.partialRead(sortedDocIds[i2], 
_numSortColumns)), (i1, i2) -> {
         int temp = sortedDocIds[i1];
         sortedDocIds[i1] = sortedDocIds[i2];
         sortedDocIds[i2] = temp;
@@ -170,13 +142,8 @@ public class ConcatCollector implements Collector {
 
       @Override
       public GenericRow next() {
-        long offset;
-        if (sortedDocIds == null) {
-          offset = _collectorRecordOffsets.get(_nextDocId++);
-        } else {
-          offset = _collectorRecordOffsets.get(sortedDocIds[_nextDocId++]);
-        }
-        return _genericRowDeserializer.deserialize(offset, _reuse);
+        int docId = sortedDocIds != null ? sortedDocIds[_nextDocId++] : 
_nextDocId++;
+        return _recordFileReader.read(docId, _reuse);
       }
     };
   }
@@ -189,28 +156,27 @@ public class ConcatCollector implements Collector {
   @Override
   public void reset()
       throws IOException {
-    try {
-      if (_collectorRecordBuffer != null) {
-        _collectorRecordBuffer.close();
-      }
-      if (_collectorRecordOutputStream != null) {
-        _collectorRecordOutputStream.close();
-      }
-    } finally {
-      FileUtils.deleteQuietly(_collectorRecordFile);
+    if (_recordFileWriter != null) {
+      _recordFileWriter.close();
     }
-    initializeBuffer();
+    if (_recordFileReader != null) {
+      _recordFileReader.close();
+    }
+    FileUtils.cleanDirectory(_workingDir);
+    // TODO: Pass 'includeNullFields' from the config
+    _recordFileWriter = new GenericRowFileWriter(_recordOffsetFile, 
_recordDataFile, _fieldSpecs, true);
+    _numDocs = 0;
   }
 
   @Override
   public void close()
       throws IOException {
     try {
-      if (_collectorRecordBuffer != null) {
-        _collectorRecordBuffer.close();
+      if (_recordFileWriter != null) {
+        _recordFileWriter.close();
       }
-      if (_collectorRecordOutputStream != null) {
-        _collectorRecordOutputStream.close();
+      if (_recordFileReader != null) {
+        _recordFileReader.close();
       }
     } finally {
       FileUtils.deleteQuietly(_workingDir);
diff --git 
a/pinot-core/src/main/java/org/apache/pinot/core/segment/processing/serde/GenericRowDeserializer.java
 
b/pinot-core/src/main/java/org/apache/pinot/core/segment/processing/genericrow/GenericRowDeserializer.java
similarity index 99%
rename from 
pinot-core/src/main/java/org/apache/pinot/core/segment/processing/serde/GenericRowDeserializer.java
rename to 
pinot-core/src/main/java/org/apache/pinot/core/segment/processing/genericrow/GenericRowDeserializer.java
index 83a5863..caee49b 100644
--- 
a/pinot-core/src/main/java/org/apache/pinot/core/segment/processing/serde/GenericRowDeserializer.java
+++ 
b/pinot-core/src/main/java/org/apache/pinot/core/segment/processing/genericrow/GenericRowDeserializer.java
@@ -16,7 +16,7 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-package org.apache.pinot.core.segment.processing.serde;
+package org.apache.pinot.core.segment.processing.genericrow;
 
 import com.google.common.base.Preconditions;
 import java.util.List;
diff --git 
a/pinot-core/src/main/java/org/apache/pinot/core/segment/processing/genericrow/GenericRowFileReader.java
 
b/pinot-core/src/main/java/org/apache/pinot/core/segment/processing/genericrow/GenericRowFileReader.java
new file mode 100644
index 0000000..0e1ba1b
--- /dev/null
+++ 
b/pinot-core/src/main/java/org/apache/pinot/core/segment/processing/genericrow/GenericRowFileReader.java
@@ -0,0 +1,81 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.pinot.core.segment.processing.genericrow;
+
+import java.io.Closeable;
+import java.io.File;
+import java.io.IOException;
+import java.nio.ByteOrder;
+import java.util.List;
+import org.apache.pinot.segment.spi.memory.PinotDataBuffer;
+import org.apache.pinot.spi.data.FieldSpec;
+import org.apache.pinot.spi.data.readers.GenericRow;
+
+
+/**
+ * File reader for {@link GenericRow}. The input files should be generated by 
the {@link GenericRowFileWriter}.
+ */
+public class GenericRowFileReader implements Closeable {
+  private final int _numRows;
+  private final PinotDataBuffer _offsetBuffer;
+  private final PinotDataBuffer _dataBuffer;
+  private final GenericRowDeserializer _deserializer;
+
+  public GenericRowFileReader(File offsetFile, File dataFile, List<FieldSpec> 
fieldSpecs, boolean includeNullFields)
+      throws IOException {
+    long offsetFileLength = offsetFile.length();
+    _numRows = (int) (offsetFileLength >>> 3); // offsetFileLength / Long.BYTES
+    _offsetBuffer = PinotDataBuffer
+        .mapFile(offsetFile, true, 0L, offsetFileLength, ByteOrder.BIG_ENDIAN, 
"GenericRow offset buffer");
+    _dataBuffer = PinotDataBuffer
+        .mapFile(dataFile, true, 0L, dataFile.length(), 
PinotDataBuffer.NATIVE_ORDER, "GenericRow data buffer");
+    _deserializer = new GenericRowDeserializer(_dataBuffer, fieldSpecs, 
includeNullFields);
+  }
+
+  /**
+   * Returns the number of rows within the files.
+   */
+  public int getNumRows() {
+    return _numRows;
+  }
+
+  /**
+   * Reads the data of the given row id into the given reusable row.
+   */
+  public GenericRow read(int rowId, GenericRow reuse) {
+    long offset = _offsetBuffer.getLong((long) rowId << 3); // rowId * 
Long.BYTES
+    _deserializer.deserialize(offset, reuse);
+    return reuse;
+  }
+
+  /**
+   * Reads the first several fields of the given row id.
+   */
+  public Object[] partialRead(int rowId, int numFields) {
+    long offset = _offsetBuffer.getLong((long) rowId << 3);
+    return _deserializer.partialDeserialize(offset, numFields);
+  }
+
+  @Override
+  public void close()
+      throws IOException {
+    _offsetBuffer.close();
+    _dataBuffer.close();
+  }
+}
diff --git 
a/pinot-core/src/main/java/org/apache/pinot/core/segment/processing/genericrow/GenericRowFileWriter.java
 
b/pinot-core/src/main/java/org/apache/pinot/core/segment/processing/genericrow/GenericRowFileWriter.java
new file mode 100644
index 0000000..4f7957c
--- /dev/null
+++ 
b/pinot-core/src/main/java/org/apache/pinot/core/segment/processing/genericrow/GenericRowFileWriter.java
@@ -0,0 +1,72 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.pinot.core.segment.processing.genericrow;
+
+import java.io.BufferedOutputStream;
+import java.io.Closeable;
+import java.io.DataOutputStream;
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.util.List;
+import org.apache.pinot.spi.data.FieldSpec;
+import org.apache.pinot.spi.data.readers.GenericRow;
+
+
+/**
+ * File writer for {@link GenericRow}. The writer generates 2 files, one for the offsets (BIG_ENDIAN) and one for
+ * the actual data (NATIVE_ORDER). The generated files can be read by the {@link GenericRowFileReader}. There is no
+ * version control for the generated files because they should only be used as an intermediate format and read on the
+ * same host (a different host might have a different NATIVE_ORDER).
+ *
+ * TODO: Consider using ByteBuffer instead of OutputStream.
+ */
+public class GenericRowFileWriter implements Closeable {
+  private final DataOutputStream _offsetStream;
+  private final BufferedOutputStream _dataStream;
+  private final GenericRowSerializer _serializer;
+
+  private long _nextOffset;
+
+  public GenericRowFileWriter(File offsetFile, File dataFile, List<FieldSpec> 
fieldSpecs, boolean includeNullFields)
+      throws FileNotFoundException {
+    _offsetStream = new DataOutputStream(new BufferedOutputStream(new 
FileOutputStream(offsetFile)));
+    _dataStream = new BufferedOutputStream(new FileOutputStream(dataFile));
+    _serializer = new GenericRowSerializer(fieldSpecs, includeNullFields);
+  }
+
+  /**
+   * Writes the given row into the files.
+   */
+  public void write(GenericRow genericRow)
+      throws IOException {
+    _offsetStream.writeLong(_nextOffset);
+    byte[] bytes = _serializer.serialize(genericRow);
+    _dataStream.write(bytes);
+    _nextOffset += bytes.length;
+  }
+
+  @Override
+  public void close()
+      throws IOException {
+    _offsetStream.close();
+    _dataStream.close();
+  }
+}
diff --git 
a/pinot-core/src/main/java/org/apache/pinot/core/segment/processing/serde/GenericRowSerializer.java
 
b/pinot-core/src/main/java/org/apache/pinot/core/segment/processing/genericrow/GenericRowSerializer.java
similarity index 99%
rename from 
pinot-core/src/main/java/org/apache/pinot/core/segment/processing/serde/GenericRowSerializer.java
rename to 
pinot-core/src/main/java/org/apache/pinot/core/segment/processing/genericrow/GenericRowSerializer.java
index 7d70201..7549670 100644
--- 
a/pinot-core/src/main/java/org/apache/pinot/core/segment/processing/serde/GenericRowSerializer.java
+++ 
b/pinot-core/src/main/java/org/apache/pinot/core/segment/processing/genericrow/GenericRowSerializer.java
@@ -16,7 +16,7 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-package org.apache.pinot.core.segment.processing.serde;
+package org.apache.pinot.core.segment.processing.genericrow;
 
 import java.nio.ByteBuffer;
 import java.util.HashMap;
diff --git 
a/pinot-core/src/test/java/org/apache/pinot/core/segment/processing/serde/GenericRowSerDeTest.java
 
b/pinot-core/src/test/java/org/apache/pinot/core/segment/processing/genericrow/GenericRowSerDeTest.java
similarity index 99%
rename from 
pinot-core/src/test/java/org/apache/pinot/core/segment/processing/serde/GenericRowSerDeTest.java
rename to 
pinot-core/src/test/java/org/apache/pinot/core/segment/processing/genericrow/GenericRowSerDeTest.java
index fc2d8ab..cc85316 100644
--- 
a/pinot-core/src/test/java/org/apache/pinot/core/segment/processing/serde/GenericRowSerDeTest.java
+++ 
b/pinot-core/src/test/java/org/apache/pinot/core/segment/processing/genericrow/GenericRowSerDeTest.java
@@ -16,7 +16,7 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-package org.apache.pinot.core.segment.processing.serde;
+package org.apache.pinot.core.segment.processing.genericrow;
 
 import java.util.Arrays;
 import java.util.Collections;

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to