clintropolis commented on a change in pull request #6016: Druid 'Shapeshifting' Columns URL: https://github.com/apache/incubator-druid/pull/6016#discussion_r207380790
########## File path: processing/src/main/java/io/druid/segment/data/ShapeShiftingColumnSerializer.java ########## @@ -0,0 +1,298 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package io.druid.segment.data; + +import com.google.common.base.Preconditions; +import com.google.common.collect.Maps; +import com.google.common.primitives.Ints; +import io.druid.java.util.common.io.smoosh.FileSmoosher; +import io.druid.java.util.common.logger.Logger; +import io.druid.segment.IndexSpec; +import io.druid.segment.data.codecs.CompressedFormEncoder; +import io.druid.segment.data.codecs.FormEncoder; +import io.druid.segment.data.codecs.FormMetrics; +import io.druid.segment.serde.Serializer; +import io.druid.segment.writeout.SegmentWriteOutMedium; +import io.druid.segment.writeout.WriteOutBytes; + +import javax.annotation.Nullable; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.nio.channels.WritableByteChannel; +import java.util.Map; +import java.util.function.Function; + +/** + * Base serializer for {@link ShapeShiftingColumn} implementations, providing most common functionality such as headers, + * value-chunking, encoder selection, and writing out values. 
+ * + * Encoding Selection: + * The intention of this base structure is that implementors of this class will analyze incoming values and aggregate + * facts about the data that matching {@link FormEncoder} implementations might find interesting, while storing raw, + * unencoded values in {@link ShapeShiftingColumnSerializer#currentChunk}. When the threshold of + * {@link ShapeShiftingColumnSerializer#valuesPerChunk} is reached, {@link ShapeShiftingColumnSerializer} will attempt + * to find the "best" encoding by first computing the encoded size with + * {@link FormEncoder#getEncodedSize} and then applying a modifier to scale this value in order to influence behavior + * when sizes are relatively close according to the chosen {@link IndexSpec.ShapeShiftOptimizationTarget}. This + * effectively sways the decision towards using encodings with faster decoding speed or smaller encoded size as + * appropriate. Note that very often the best encoding is unambiguous and these settings don't matter; the nuanced + * differences of behavior of {@link IndexSpec.ShapeShiftOptimizationTarget} mainly come into play when things are + * close. + * + * Implementors need only supply an initialize method to allocate storage for {@code <TChunk>}, an add value method to + * populate {@code <TChunk>}, a reset method to prepare {@code <TChunkMetrics>} for the next chunk after a flush, and + * of course, matching {@link FormEncoder} implementations to perform actual value encoding. Generic compression is + * available to {@link FormEncoder} implementations by implementing + * {@link io.druid.segment.data.codecs.CompressibleFormEncoder} and wrapping in a + * {@link io.druid.segment.data.codecs.CompressedFormEncoder} in the codec list passed to the serializer. 
+ * + * layout: + * | version (byte) | headerSize (int) | numValues (int) | numChunks (int) | logValuesPerChunk (byte) | offsetsOutSize (int) | compositionSize (int) | composition | offsets | values | + * + * @param <TChunk> + * @param <TChunkMetrics> + */ +public abstract class ShapeShiftingColumnSerializer<TChunk, TChunkMetrics extends FormMetrics> implements Serializer +{ + /** + * | version (byte) | headerSize (int) | numValues (int) | numChunks (int) | logValuesPerChunk (byte) | offsetsOutSize (int) | compositionSize (int) | + */ + private static final int BASE_HEADER_BYTES = 1 + (3 * Integer.BYTES) + 1 + (2 * Integer.BYTES); + + private static Logger log = new Logger(ShapeShiftingColumnSerializer.class); + + protected final SegmentWriteOutMedium segmentWriteOutMedium; + protected final FormEncoder<TChunk, TChunkMetrics>[] codecs; + protected final byte version; + protected final byte logValuesPerChunk; + protected final int valuesPerChunk; + protected final ByteBuffer intToBytesHelperBuffer; + protected final Map<FormEncoder, Integer> composition; Review comment: 👍 will look into this ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [email protected] With regards, Apache Git Services --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
