clintropolis commented on code in PR #13803: URL: https://github.com/apache/druid/pull/13803#discussion_r1148815506
########## processing/src/main/java/org/apache/druid/segment/data/FrontCodedIntArrayIndexed.java: ########## @@ -0,0 +1,524 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.druid.segment.data; + +import com.google.common.base.Preconditions; +import com.google.common.base.Supplier; +import org.apache.druid.common.config.NullHandling; +import org.apache.druid.java.util.common.ISE; +import org.apache.druid.java.util.common.StringUtils; +import org.apache.druid.query.monomorphicprocessing.RuntimeShapeInspector; + +import javax.annotation.Nullable; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.util.Collections; +import java.util.Iterator; +import java.util.NoSuchElementException; + +/** + * {@link Indexed} specialized for storing int arrays, which must be sorted and unique, using 'front coding'. + * + * Front coding is a type of delta encoding, where sorted values are grouped into buckets. The first value of the bucket + * is written entirely, and remaining values are stored as a pair of an integer which indicates how much of the first + * int array of the bucket to use as a prefix, followed by the remaining ints after the prefix to complete the value. + * + * front coded indexed layout: + * | version | bucket size | has null? | number of values | size of "offsets" + "buckets" | "offsets" | "buckets" | + * | ------- | ----------- | --------- | ---------------- | ----------------------------- | --------- | --------- | + * | byte | byte | byte | vbyte int | vbyte int | int[] | bucket[] | + * + * "offsets" are the ending offsets of each bucket stored in order, stored as plain integers for easy random access. + * + * bucket layout: + * | first value | prefix length | fragment | ... | prefix length | fragment | + * | ----------- | ------------- | -------- | --- | ------------- | -------- | + * | int[] | vbyte int | int[] | ... | vbyte int | int[] | + * + * int array layout: + * | length | ints | + * | ----------- | ----- | + * | vbyte int | int[] | + * + * + * Getting a value first picks the appropriate bucket, finds its offset in the underlying buffer, then scans the bucket + * values to seek to the correct position of the value within the bucket in order to reconstruct it using the prefix + * length. + * + * Finding the index of a value involves binary searching the first values of each bucket to find the correct bucket, + * then a linear scan within the bucket to find the matching value (or negative insertion point -1 for values that + * are not present). + * + * The value iterator reads an entire bucket at a time, reconstructing the values into an array to iterate within the + * bucket before moving onto the next bucket as the iterator is consumed. + * + * This class is not thread-safe since during operation modifies positions of a shared buffer. + */ +public final class FrontCodedIntArrayIndexed implements Indexed<int[]> +{ + public static Supplier<FrontCodedIntArrayIndexed> read(ByteBuffer buffer, ByteOrder ordering) + { + final ByteBuffer orderedBuffer = buffer.asReadOnlyBuffer().order(ordering); + final byte version = orderedBuffer.get(); + Preconditions.checkArgument(version == 0, "only V0 exists, encountered " + version); + final int bucketSize = Byte.toUnsignedInt(orderedBuffer.get()); + final boolean hasNull = NullHandling.IS_NULL_BYTE == orderedBuffer.get(); + final int numValues = VByte.readInt(orderedBuffer); + // size of offsets + values + final int size = VByte.readInt(orderedBuffer); + final int offsetsPosition = orderedBuffer.position(); + // move position to end of buffer + buffer.position(offsetsPosition + size); + + return () -> new FrontCodedIntArrayIndexed( + buffer, + ordering, + bucketSize, + numValues, + hasNull, + offsetsPosition + ); + } + + private final ByteBuffer buffer; + private final int adjustedNumValues; + private final int adjustIndex; + private final int bucketSize; + private final int numBuckets; + private final int div; + private final int rem; + private final int offsetsPosition; + private final int bucketsPosition; + private final boolean hasNull; + private final int lastBucketNumValues; + + private FrontCodedIntArrayIndexed( + ByteBuffer buffer, + ByteOrder order, + int bucketSize, + int numValues, + boolean hasNull, + int offsetsPosition + ) + { + if (Integer.bitCount(bucketSize) != 1) { + throw new ISE("bucketSize must be a power of two but was[%,d]", bucketSize); + } + this.buffer = buffer.asReadOnlyBuffer().order(order); + this.bucketSize = bucketSize; + this.hasNull = hasNull; + + this.numBuckets = (int) Math.ceil((double) numValues / (double) bucketSize); + this.adjustIndex = hasNull ? 1 : 0; + this.adjustedNumValues = numValues + adjustIndex; + this.div = Integer.numberOfTrailingZeros(bucketSize); + this.rem = bucketSize - 1; + this.lastBucketNumValues = (numValues & rem) == 0 ? bucketSize : numValues & rem; + this.offsetsPosition = offsetsPosition; + this.bucketsPosition = offsetsPosition + ((numBuckets - 1) * Integer.BYTES); + } + + @Override + public int size() + { + return adjustedNumValues; + } + + @Nullable + @Override + public int[] get(int index) + { + if (hasNull && index == 0) { + return null; + } + Indexed.checkIndex(index, adjustedNumValues); + + // due to vbyte encoding, the null value is not actually stored in the bucket (no negative values), so we adjust + // the index Review Comment: this is like the same code/comments as in `FrontCodedIndexed`, just adapted for `int[]` instead of `byte[]`, what is the empty array here is the empty string there, so it is the same problem. I guess i can update the comments/javadocs on both of them to make it more apparent of why we special handle null values and how it relates to `VByte` encoding -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
