clintropolis commented on code in PR #18722: URL: https://github.com/apache/druid/pull/18722#discussion_r2599826921
########## processing/src/main/java/org/apache/druid/segment/column/BitmapIndexEncodingStrategy.java: ########## @@ -0,0 +1,165 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.druid.segment.column; + +import com.fasterxml.jackson.annotation.JsonSubTypes; +import com.fasterxml.jackson.annotation.JsonTypeInfo; +import org.apache.druid.collections.bitmap.BitmapFactory; +import org.apache.druid.collections.bitmap.ImmutableBitmap; +import org.apache.druid.collections.bitmap.MutableBitmap; +import org.apache.druid.error.DruidException; +import org.apache.druid.segment.data.GenericIndexedWriter; +import org.apache.druid.segment.file.SegmentFileBuilder; +import org.apache.druid.segment.serde.Serializer; + +import javax.annotation.Nullable; +import java.io.IOException; +import java.nio.channels.WritableByteChannel; +import java.util.Objects; + +@JsonTypeInfo(use = JsonTypeInfo.Id.NAME, property = "type") +@JsonSubTypes({ + @JsonSubTypes.Type(value = BitmapIndexEncodingStrategy.DictionaryId.class, name = "dictionaryId"), + @JsonSubTypes.Type(value = BitmapIndexEncodingStrategy.NullsOnly.class, name = "nullsOnly") Review Comment: i think lets call these `dictionaryEncodedValuesIndex` and `nullValueIndex` to be consistent with the internal class names that use them on the other side (`DictionaryEncodedValueIndex` and `NullValueIndex`) ########## processing/src/test/java/org/apache/druid/query/NestedDataTestUtils.java: ########## @@ -307,76 +160,144 @@ public static List<Segment> createSegmentsWithConcatenatedJsonInput( File file = selfConcatenateResourceFile(tempFolder, inputFile, numCopies); inputFiles.add(new LocalInputSource(file.getParentFile(), file.getName())); } - return createSegments( - tempFolder, - closer, - inputFiles, - TestIndex.DEFAULT_JSON_INPUT_FORMAT, - TIMESTAMP_SPEC, - AUTO_DISCOVERY, - TransformSpec.NONE, - COUNT, - granularity, - rollup, - IndexSpec.getDefault() - ); + return new SegmentBuilder(tempFolder, closer).inputSources(inputFiles) + .granularity(granularity) + .rollup(rollup) + .build(); } - public static List<Segment> createSegmentsForJsonInput( - TemporaryFolder tempFolder, - Closer closer, - String inputFile, - IndexSpec indexSpec - ) - throws Exception + public static class SegmentBuilder Review Comment: I think we should call this `ResourceFileSegmentBuilder` so it is more obvious that this is a wrapper around `IndexBuilder` to streamline using it with resource files. ########## processing/src/main/java/org/apache/druid/segment/nested/NestedCommonFormatColumnFormatSpec.java: ########## @@ -172,14 +184,18 @@ public static NestedCommonFormatColumnFormatSpec getEffectiveFormatSpec( private final CompressionStrategy doubleColumnCompression; @Nullable private final BitmapSerdeFactory bitmapEncoding; + @Nullable + private final BitmapIndexEncodingStrategy numericFieldsBitmapIndexEncoding; @JsonCreator public NestedCommonFormatColumnFormatSpec( @JsonProperty("objectFieldsDictionaryEncoding") @Nullable StringEncodingStrategy objectFieldsDictionaryEncoding, + @JsonProperty("numericFieldsBitmapIndexEncoding") @Nullable BitmapIndexEncodingStrategy numericFieldsBitmapIndexEncoding, Review Comment: this should be separately controllable for long and double fields, also I don't think 'encoding' is quite the correct thing to call this as the type of bitmap encoding itself is controlled by `IndexSpec`. How about something like `longFieldIndexType` and `doubleFieldIndexType` since these are controlling the type of indexes we build for the field. ########## processing/src/main/java/org/apache/druid/segment/column/BitmapIndexEncodingStrategy.java: ########## @@ -0,0 +1,165 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.druid.segment.column; + +import com.fasterxml.jackson.annotation.JsonSubTypes; +import com.fasterxml.jackson.annotation.JsonTypeInfo; +import org.apache.druid.collections.bitmap.BitmapFactory; +import org.apache.druid.collections.bitmap.ImmutableBitmap; +import org.apache.druid.collections.bitmap.MutableBitmap; +import org.apache.druid.error.DruidException; +import org.apache.druid.segment.data.GenericIndexedWriter; +import org.apache.druid.segment.file.SegmentFileBuilder; +import org.apache.druid.segment.serde.Serializer; + +import javax.annotation.Nullable; +import java.io.IOException; +import java.nio.channels.WritableByteChannel; +import java.util.Objects; + +@JsonTypeInfo(use = JsonTypeInfo.Id.NAME, property = "type") +@JsonSubTypes({ + @JsonSubTypes.Type(value = BitmapIndexEncodingStrategy.DictionaryId.class, name = "dictionaryId"), + @JsonSubTypes.Type(value = BitmapIndexEncodingStrategy.NullsOnly.class, name = "nullsOnly") +}) +public abstract class BitmapIndexEncodingStrategy implements Serializer +{ + /** + * Assigned in {@link #init(BitmapFactory, int)} + */ + @Nullable + protected MutableBitmap[] bitmaps; + /** + * Assigned in {@link #close(BitmapFactory, GenericIndexedWriter)} + */ + @Nullable + GenericIndexedWriter<ImmutableBitmap> writer; + + public abstract void init(BitmapFactory bitmapFactory, int dictionarySize); + + public abstract void add(int row, int sortedId, @Nullable Object o); + + public void close(BitmapFactory bitmapFactory, GenericIndexedWriter<ImmutableBitmap> writer) throws IOException Review Comment: nit: this method name is kind of confusing, maybe calling it `closeForWrite` or `writeToWriter` or something would be clearer, since this writing to the writer that is passed in, and `getSerializedSize` and `writeTo` will be called after this method (which feels wrong after calling close on something) That said, I have a bunch of a changes I want to make to this abstraction, so is fine to leave as is too since I kind of want to change these interfaces a lot in a follow-up ########## processing/src/main/java/org/apache/druid/segment/column/BitmapIndexEncodingStrategy.java: ########## @@ -0,0 +1,165 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.druid.segment.column; + +import com.fasterxml.jackson.annotation.JsonSubTypes; +import com.fasterxml.jackson.annotation.JsonTypeInfo; +import org.apache.druid.collections.bitmap.BitmapFactory; +import org.apache.druid.collections.bitmap.ImmutableBitmap; +import org.apache.druid.collections.bitmap.MutableBitmap; +import org.apache.druid.error.DruidException; +import org.apache.druid.segment.data.GenericIndexedWriter; +import org.apache.druid.segment.file.SegmentFileBuilder; +import org.apache.druid.segment.serde.Serializer; + +import javax.annotation.Nullable; +import java.io.IOException; +import java.nio.channels.WritableByteChannel; +import java.util.Objects; + +@JsonTypeInfo(use = JsonTypeInfo.Id.NAME, property = "type") +@JsonSubTypes({ + @JsonSubTypes.Type(value = BitmapIndexEncodingStrategy.DictionaryId.class, name = "dictionaryId"), + @JsonSubTypes.Type(value = BitmapIndexEncodingStrategy.NullsOnly.class, name = "nullsOnly") +}) +public abstract class BitmapIndexEncodingStrategy implements Serializer +{ + /** + * Assigned in {@link #init(BitmapFactory, int)} + */ + @Nullable + protected MutableBitmap[] bitmaps; + /** + * Assigned in {@link #close(BitmapFactory, GenericIndexedWriter)} + */ + @Nullable + GenericIndexedWriter<ImmutableBitmap> writer; + + public abstract void init(BitmapFactory bitmapFactory, int dictionarySize); + + public abstract void add(int row, int sortedId, @Nullable Object o); + + public void close(BitmapFactory bitmapFactory, GenericIndexedWriter<ImmutableBitmap> writer) throws IOException + { + if (bitmaps == null) { + throw DruidException.defensive("Not initiated yet"); + } + this.writer = writer; + for (int i = 0; i < bitmaps.length; i++) { + writer.write(bitmapFactory.makeImmutableBitmap(bitmaps[i])); + bitmaps[i] = null; // Reclaim memory + } + bitmaps = null; + } + + @Override + public long getSerializedSize() + { + return writer.getSerializedSize(); + } + + @Override + public void writeTo(WritableByteChannel channel, SegmentFileBuilder fileBuilder) throws IOException + { + writer.writeTo(channel, fileBuilder); + } Review Comment: these methods should maybe defensively check that writer has been set (the method currently called `close` has been called) ########## processing/src/main/java/org/apache/druid/segment/nested/GlobalDictionaryEncodedFieldColumnWriter.java: ########## @@ -272,21 +272,21 @@ public long getSerializedSize() throws IOException } else { arraySize = 0; } - return 1 + Integer.BYTES + + return 1 + Integer.BYTES + // version + feature flags sortedDictionaryWriter.getSerializedSize() + - bitmapIndexWriter.getSerializedSize() + - arraySize + - getSerializedColumnSize(); + getSerializedColumnSize() + + bitmapIndexEncoding.getSerializedSize() + + arraySize; } @Override public void writeTo(WritableByteChannel channel, SegmentFileBuilder fileBuilder) throws IOException { Channels.writeFully(channel, ByteBuffer.wrap(new byte[]{version.asByte()})); - channel.write(ByteBuffer.wrap(Ints.toByteArray(flags))); + Channels.writeFully(channel, ByteBuffer.wrap(Ints.toByteArray(flags))); Review Comment: was this causing a problem? ########## processing/src/main/java/org/apache/druid/segment/nested/NestedDataColumnV5.java: ########## @@ -61,7 +60,7 @@ public NestedDataColumnV5( Supplier<FixedIndexed<Double>> doubleDictionarySupplier, Supplier<FrontCodedIntArrayIndexed> arrayDictionarySupplier, SegmentFileMapper fileMapper, - BitmapSerdeFactory bitmapSerdeFactory, + NestedCommonFormatColumnFormatSpec bitmapSerdeFactory, Review Comment: nit: variable name should be `columnFormatSpec` ########## processing/src/main/java/org/apache/druid/segment/nested/GlobalDictionaryEncodedFieldColumnWriter.java: ########## @@ -187,7 +189,11 @@ long getSerializedColumnSize() throws IOException * Defines how to write the column, including the dictionary id column, along with any additional columns * such as the long or double value column as type appropriate. */ - abstract void writeColumnTo(WritableByteChannel channel, SegmentFileBuilder fileBuilder) throws IOException; + void writeColumnTo(WritableByteChannel channel, SegmentFileBuilder fileBuilder) throws IOException + { + writeLongAndDoubleColumnLength(channel, 0, 0); + encodedValueSerializer.writeTo(channel, fileBuilder); + } Review Comment: why remove the abstraction here? I made it abstract on purpose to ensure that every field implements this method since it is pretty important to be correct ########## processing/src/main/java/org/apache/druid/segment/column/BitmapIndexEncodingStrategy.java: ########## @@ -0,0 +1,165 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.druid.segment.column; + +import com.fasterxml.jackson.annotation.JsonSubTypes; +import com.fasterxml.jackson.annotation.JsonTypeInfo; +import org.apache.druid.collections.bitmap.BitmapFactory; +import org.apache.druid.collections.bitmap.ImmutableBitmap; +import org.apache.druid.collections.bitmap.MutableBitmap; +import org.apache.druid.error.DruidException; +import org.apache.druid.segment.data.GenericIndexedWriter; +import org.apache.druid.segment.file.SegmentFileBuilder; +import org.apache.druid.segment.serde.Serializer; + +import javax.annotation.Nullable; +import java.io.IOException; +import java.nio.channels.WritableByteChannel; +import java.util.Objects; + +@JsonTypeInfo(use = JsonTypeInfo.Id.NAME, property = "type") +@JsonSubTypes({ + @JsonSubTypes.Type(value = BitmapIndexEncodingStrategy.DictionaryId.class, name = "dictionaryId"), + @JsonSubTypes.Type(value = BitmapIndexEncodingStrategy.NullsOnly.class, name = "nullsOnly") +}) +public abstract class BitmapIndexEncodingStrategy implements Serializer +{ + /** + * Assigned in {@link #init(BitmapFactory, int)} + */ + @Nullable + protected MutableBitmap[] bitmaps; + /** + * Assigned in {@link #close(BitmapFactory, GenericIndexedWriter)} + */ + @Nullable + GenericIndexedWriter<ImmutableBitmap> writer; + + public abstract void init(BitmapFactory bitmapFactory, int dictionarySize); + + public abstract void add(int row, int sortedId, @Nullable Object o); + + public void close(BitmapFactory bitmapFactory, GenericIndexedWriter<ImmutableBitmap> writer) throws IOException + { + if (bitmaps == null) { + throw DruidException.defensive("Not initiated yet"); + } + this.writer = writer; + for (int i = 0; i < bitmaps.length; i++) { + writer.write(bitmapFactory.makeImmutableBitmap(bitmaps[i])); + bitmaps[i] = null; // Reclaim memory + } + bitmaps = null; + } + + @Override + public long getSerializedSize() + { + return writer.getSerializedSize(); + } + + @Override + public void writeTo(WritableByteChannel channel, SegmentFileBuilder fileBuilder) throws IOException + { + writer.writeTo(channel, fileBuilder); + } + + public static class DictionaryId extends BitmapIndexEncodingStrategy + { + public static final DictionaryId INSTANCE = new DictionaryId(); + + @Override + public void init(BitmapFactory bitmapFactory, int dictionarySize) + { + bitmaps = new MutableBitmap[dictionarySize]; + for (int index = 0; index < dictionarySize; index++) { + bitmaps[index] = bitmapFactory.makeEmptyMutableBitmap(); + } + } + + @Override + public void add(int row, int sortedId, @Nullable Object o) + { + bitmaps[sortedId].add(row); + } + + @Override + public boolean equals(Object o) + { + if (this == o) { + return true; + } + return o != null && getClass() == o.getClass(); + } + + @Override + public int hashCode() + { + return Objects.hashCode(getClass()); + } + + @Override + public String toString() + { + return "DictionaryId"; Review Comment: nit: maybe use the same json type name string? ########## processing/src/main/java/org/apache/druid/segment/nested/NestedCommonFormatColumnFormatSpec.java: ########## @@ -85,6 +90,13 @@ public static NestedCommonFormatColumnFormatSpec getEffectiveFormatSpec( defaultSpec = DEFAULT; } + if (builder.numericFieldsBitmapIndexEncoding == null) { + builder.setNumericFieldsBitmapIndexEncoding(GuavaUtils.firstNonNull( + defaultSpec.getNumericFieldsBitmapIndexEncoding(), + DEFAULT.getNumericFieldsBitmapIndexEncoding() + )); Review Comment: nit: this would be more consistent with everything else in this method if it just used regular if/else... or I guess could change everything else, but i find the if/else a bit more readable/clearer for something with a lot of options resolving defaults like this.. ########## processing/src/main/java/org/apache/druid/segment/column/BitmapIndexEncodingStrategy.java: ########## @@ -0,0 +1,165 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.druid.segment.column; + +import com.fasterxml.jackson.annotation.JsonSubTypes; +import com.fasterxml.jackson.annotation.JsonTypeInfo; +import org.apache.druid.collections.bitmap.BitmapFactory; +import org.apache.druid.collections.bitmap.ImmutableBitmap; +import org.apache.druid.collections.bitmap.MutableBitmap; +import org.apache.druid.error.DruidException; +import org.apache.druid.segment.data.GenericIndexedWriter; +import org.apache.druid.segment.file.SegmentFileBuilder; +import org.apache.druid.segment.serde.Serializer; + +import javax.annotation.Nullable; +import java.io.IOException; +import java.nio.channels.WritableByteChannel; +import java.util.Objects; + +@JsonTypeInfo(use = JsonTypeInfo.Id.NAME, property = "type") +@JsonSubTypes({ + @JsonSubTypes.Type(value = BitmapIndexEncodingStrategy.DictionaryId.class, name = "dictionaryId"), + @JsonSubTypes.Type(value = BitmapIndexEncodingStrategy.NullsOnly.class, name = "nullsOnly") +}) +public abstract class BitmapIndexEncodingStrategy implements Serializer +{ + /** + * Assigned in {@link #init(BitmapFactory, int)} + */ + @Nullable + protected MutableBitmap[] bitmaps; + /** + * Assigned in {@link #close(BitmapFactory, GenericIndexedWriter)} + */ + @Nullable + GenericIndexedWriter<ImmutableBitmap> writer; + + public abstract void init(BitmapFactory bitmapFactory, int dictionarySize); + + public abstract void add(int row, int sortedId, @Nullable Object o); + + public void close(BitmapFactory bitmapFactory, GenericIndexedWriter<ImmutableBitmap> writer) throws IOException + { + if (bitmaps == null) { + throw DruidException.defensive("Not initiated yet"); + } + this.writer = writer; + for (int i = 0; i < bitmaps.length; i++) { + writer.write(bitmapFactory.makeImmutableBitmap(bitmaps[i])); + bitmaps[i] = null; // Reclaim memory + } + bitmaps = null; + } + + @Override + public long getSerializedSize() + { + return writer.getSerializedSize(); + } + + @Override + public void writeTo(WritableByteChannel channel, SegmentFileBuilder fileBuilder) throws IOException + { + writer.writeTo(channel, fileBuilder); + } + + public static class DictionaryId extends BitmapIndexEncodingStrategy + { + public static final DictionaryId INSTANCE = new DictionaryId(); + + @Override + public void init(BitmapFactory bitmapFactory, int dictionarySize) + { + bitmaps = new MutableBitmap[dictionarySize]; + for (int index = 0; index < dictionarySize; index++) { + bitmaps[index] = bitmapFactory.makeEmptyMutableBitmap(); + } + } + + @Override + public void add(int row, int sortedId, @Nullable Object o) + { + bitmaps[sortedId].add(row); + } + + @Override + public boolean equals(Object o) + { + if (this == o) { + return true; + } + return o != null && getClass() == o.getClass(); + } + + @Override + public int hashCode() + { + return Objects.hashCode(getClass()); + } + + @Override + public String toString() + { + return "DictionaryId"; + } + } + + public static class NullsOnly extends BitmapIndexEncodingStrategy Review Comment: its kind of weird that the null index needs a `GenericIndexedWriter` to write only a single bitmap... I think these should just write the bitmap like other implementations of stuff that gets wired up to `NullValueIndex`, which just use a `ByteBufferWriter` to write the blob (see `LongColumnSerializerV2`, `DoubleColumnSerializerV2`, `NestedDataColumnSerializer`, etc) I see why it is like this, so you could have some shared code, but I think as we add other types of indexes there will be less and less shared code and having an abstract base type isn't really the correct abstraction. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
