clintropolis commented on a change in pull request #6016: Druid 'Shapeshifting' 
Columns
URL: https://github.com/apache/incubator-druid/pull/6016#discussion_r207380790
 
 

 ##########
 File path: 
processing/src/main/java/io/druid/segment/data/ShapeShiftingColumnSerializer.java
 ##########
 @@ -0,0 +1,298 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package io.druid.segment.data;
+
+import com.google.common.base.Preconditions;
+import com.google.common.collect.Maps;
+import com.google.common.primitives.Ints;
+import io.druid.java.util.common.io.smoosh.FileSmoosher;
+import io.druid.java.util.common.logger.Logger;
+import io.druid.segment.IndexSpec;
+import io.druid.segment.data.codecs.CompressedFormEncoder;
+import io.druid.segment.data.codecs.FormEncoder;
+import io.druid.segment.data.codecs.FormMetrics;
+import io.druid.segment.serde.Serializer;
+import io.druid.segment.writeout.SegmentWriteOutMedium;
+import io.druid.segment.writeout.WriteOutBytes;
+
+import javax.annotation.Nullable;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.nio.channels.WritableByteChannel;
+import java.util.Map;
+import java.util.function.Function;
+
+/**
+ * Base serializer for {@link ShapeShiftingColumn} implementations, providing 
the most common functionality such as headers,
+ * value-chunking, encoder selection, and writing out values.
+ *
+ * Encoding Selection:
+ * The intention of this base structure is that implementors of this class 
will analyze incoming values and aggregate
+ * facts about the data which matching {@link FormEncoder} implementations 
might find interesting, while storing raw,
+ * unencoded values in {@link ShapeShiftingColumnSerializer#currentChunk}. 
When the threshold of
+ * {@link ShapeShiftingColumnSerializer#valuesPerChunk} is reached, {@link 
ShapeShiftingColumnSerializer} will attempt
+ * to find the "best" encoding by first computing the encoded size with
+ * {@link FormEncoder#getEncodedSize} and then applying a modifier to scale 
this value in order to influence behavior
+ * when sizes are relatively close according to the chosen {@link 
IndexSpec.ShapeShiftOptimizationTarget}. This
+ * effectively sways the decision towards using encodings with faster decoding 
speed or smaller encoded size as
+ * appropriate. Note that very often the best encoding is unambiguous and 
these settings don't matter; the nuanced
+ * differences of behavior of {@link IndexSpec.ShapeShiftOptimizationTarget} 
mainly come into play when things are
+ * close.
+ *
+ * Implementors need only supply an initialize method to allocate storage for 
{@code <TChunk>}, an add value method to
+ * populate {@code <TChunk>}, a reset method to prepare {@code 
<TChunkMetrics>} for the next chunk after a flush, and
+ * of course, matching {@link FormEncoder} implementations to perform actual 
value encoding. Generic compression is
+ * available to {@link FormEncoder} implementations by implementing
+ * {@link io.druid.segment.data.codecs.CompressibleFormEncoder} and wrapping 
in a
+ * {@link io.druid.segment.data.codecs.CompressedFormEncoder} in the codec 
list passed to the serializer.
+ *
+ * layout:
+ * | version (byte) | headerSize (int) | numValues (int) | numChunks (int) | 
logValuesPerChunk (byte) | offsetsOutSize (int) |  compositionSize (int) | 
composition | offsets | values |
+ *
+ * @param <TChunk>
+ * @param <TChunkMetrics>
+ */
+public abstract class ShapeShiftingColumnSerializer<TChunk, TChunkMetrics 
extends FormMetrics> implements Serializer
+{
+  /**
+   * | version (byte) | headerSize (int) | numValues (int) | numChunks (int) | 
logValuesPerChunk (byte) | offsetsOutSize (int) | compositionSize (int) |
+   */
+  private static final int BASE_HEADER_BYTES = 1 + (3 * Integer.BYTES) + 1 + 
(2 * Integer.BYTES);
+
+  private static Logger log = new Logger(ShapeShiftingColumnSerializer.class);
+
+  protected final SegmentWriteOutMedium segmentWriteOutMedium;
+  protected final FormEncoder<TChunk, TChunkMetrics>[] codecs;
+  protected final byte version;
+  protected final byte logValuesPerChunk;
+  protected final int valuesPerChunk;
+  protected final ByteBuffer intToBytesHelperBuffer;
+  protected final Map<FormEncoder, Integer> composition;
 
 Review comment:
   👍 will look into this

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
[email protected]


With regards,
Apache Git Services

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to