clintropolis commented on code in PR #14919:
URL: https://github.com/apache/druid/pull/14919#discussion_r1311073281
##########
processing/src/main/java/org/apache/druid/segment/nested/DictionaryIdLookup.java:
##########
@@ -19,114 +19,308 @@
package org.apache.druid.segment.nested;
-import com.google.common.base.Preconditions;
-import it.unimi.dsi.fastutil.doubles.Double2IntLinkedOpenHashMap;
-import it.unimi.dsi.fastutil.doubles.Double2IntMap;
-import it.unimi.dsi.fastutil.longs.Long2IntLinkedOpenHashMap;
-import it.unimi.dsi.fastutil.longs.Long2IntMap;
-import it.unimi.dsi.fastutil.objects.Object2IntAVLTreeMap;
-import it.unimi.dsi.fastutil.objects.Object2IntLinkedOpenHashMap;
-import it.unimi.dsi.fastutil.objects.Object2IntMap;
-import org.apache.druid.segment.data.FrontCodedIntArrayIndexedWriter;
+import com.google.common.primitives.Ints;
+import org.apache.druid.annotations.SuppressFBWarnings;
+import org.apache.druid.error.DruidException;
+import org.apache.druid.java.util.common.ByteBufferUtils;
+import org.apache.druid.java.util.common.FileUtils;
+import org.apache.druid.java.util.common.ISE;
+import org.apache.druid.java.util.common.StringUtils;
+import org.apache.druid.java.util.common.io.smoosh.FileSmoosher;
+import org.apache.druid.java.util.common.io.smoosh.SmooshedFileMapper;
+import org.apache.druid.java.util.common.io.smoosh.SmooshedWriter;
+import org.apache.druid.segment.column.StringEncodingStrategies;
+import org.apache.druid.segment.column.TypeStrategies;
+import org.apache.druid.segment.data.DictionaryWriter;
+import org.apache.druid.segment.data.FixedIndexed;
+import org.apache.druid.segment.data.FrontCodedIntArrayIndexed;
+import org.apache.druid.segment.data.Indexed;
import javax.annotation.Nullable;
+import java.io.Closeable;
+import java.io.File;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.nio.MappedByteBuffer;
+import java.nio.channels.FileChannel;
+import java.nio.channels.GatheringByteChannel;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.StandardOpenOption;
+import java.util.EnumSet;
/**
- * Ingestion time dictionary identifier lookup, used by {@link
NestedCommonFormatColumnSerializer} to build a global
- * dictionary id to value mapping for the 'stacked' global value dictionaries.
+ * Value to dictionary id lookup, backed with memory mapped dictionaries
populated lazily by the supplied
+ * {@link DictionaryWriter}.
*/
-public class DictionaryIdLookup
+public final class DictionaryIdLookup implements Closeable
{
- private final Object2IntMap<String> stringLookup;
+ private final String name;
+ @Nullable
+ private final DictionaryWriter<String> stringDictionaryWriter;
+ private SmooshedFileMapper stringBufferMapper = null;
+ private Indexed<ByteBuffer> stringDictionary = null;
- private final Long2IntMap longLookup;
+ @Nullable
+ private final DictionaryWriter<Long> longDictionaryWriter;
+ private MappedByteBuffer longBuffer = null;
+ private FixedIndexed<Long> longDictionary = null;
- private final Double2IntMap doubleLookup;
+ @Nullable
+ private final DictionaryWriter<Double> doubleDictionaryWriter;
+ MappedByteBuffer doubleBuffer = null;
+ FixedIndexed<Double> doubleDictionary = null;
- private final Object2IntMap<int[]> arrayLookup;
+ @Nullable
+ private final DictionaryWriter<int[]> arrayDictionaryWriter;
+ private MappedByteBuffer arrayBuffer = null;
+ private FrontCodedIntArrayIndexed arrayDictionary = null;
- private int dictionarySize;
+ public DictionaryIdLookup(
+ String name,
+ @Nullable DictionaryWriter<String> stringDictionaryWriter,
+ @Nullable DictionaryWriter<Long> longDictionaryWriter,
+ @Nullable DictionaryWriter<Double> doubleDictionaryWriter,
+ @Nullable DictionaryWriter<int[]> arrayDictionaryWriter
+ )
+ {
+ this.name = name;
+ this.stringDictionaryWriter = stringDictionaryWriter;
+ this.longDictionaryWriter = longDictionaryWriter;
+ this.doubleDictionaryWriter = doubleDictionaryWriter;
+ this.arrayDictionaryWriter = arrayDictionaryWriter;
+ }
- public DictionaryIdLookup()
+ public int lookupString(@Nullable String value)
{
- this.stringLookup = new Object2IntLinkedOpenHashMap<>();
- stringLookup.defaultReturnValue(-1);
- this.longLookup = new Long2IntLinkedOpenHashMap();
- longLookup.defaultReturnValue(-1);
- this.doubleLookup = new Double2IntLinkedOpenHashMap();
- doubleLookup.defaultReturnValue(-1);
- this.arrayLookup = new
Object2IntAVLTreeMap<>(FrontCodedIntArrayIndexedWriter.ARRAY_COMPARATOR);
- this.arrayLookup.defaultReturnValue(-1);
+ if (stringDictionary == null) {
Review Comment:
What metric do you have in mind? Also, why 2GB instead of just a metric on
the size of the components? And shouldn't such a metric really be much broader
in scope and not just apply to nested columns? This doesn't feel like either a
blocker or something that should be resolved in this PR, but I agree it does
sound potentially nice to have a better breakdown of what is driving segment
size.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]