Re: [PR] use mmap for nested column value to dictionary id lookup for more chill heap usage during serialization (druid)

via GitHub Wed, 30 Aug 2023 22:23:22 -0700


clintropolis commented on code in PR #14919:
URL: https://github.com/apache/druid/pull/14919#discussion_r1311073281



##########
processing/src/main/java/org/apache/druid/segment/nested/DictionaryIdLookup.java:
##########
@@ -19,114 +19,308 @@
 
 package org.apache.druid.segment.nested;
 
-import com.google.common.base.Preconditions;
-import it.unimi.dsi.fastutil.doubles.Double2IntLinkedOpenHashMap;
-import it.unimi.dsi.fastutil.doubles.Double2IntMap;
-import it.unimi.dsi.fastutil.longs.Long2IntLinkedOpenHashMap;
-import it.unimi.dsi.fastutil.longs.Long2IntMap;
-import it.unimi.dsi.fastutil.objects.Object2IntAVLTreeMap;
-import it.unimi.dsi.fastutil.objects.Object2IntLinkedOpenHashMap;
-import it.unimi.dsi.fastutil.objects.Object2IntMap;
-import org.apache.druid.segment.data.FrontCodedIntArrayIndexedWriter;
+import com.google.common.primitives.Ints;
+import org.apache.druid.annotations.SuppressFBWarnings;
+import org.apache.druid.error.DruidException;
+import org.apache.druid.java.util.common.ByteBufferUtils;
+import org.apache.druid.java.util.common.FileUtils;
+import org.apache.druid.java.util.common.ISE;
+import org.apache.druid.java.util.common.StringUtils;
+import org.apache.druid.java.util.common.io.smoosh.FileSmoosher;
+import org.apache.druid.java.util.common.io.smoosh.SmooshedFileMapper;
+import org.apache.druid.java.util.common.io.smoosh.SmooshedWriter;
+import org.apache.druid.segment.column.StringEncodingStrategies;
+import org.apache.druid.segment.column.TypeStrategies;
+import org.apache.druid.segment.data.DictionaryWriter;
+import org.apache.druid.segment.data.FixedIndexed;
+import org.apache.druid.segment.data.FrontCodedIntArrayIndexed;
+import org.apache.druid.segment.data.Indexed;
 
 import javax.annotation.Nullable;
+import java.io.Closeable;
+import java.io.File;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.nio.MappedByteBuffer;
+import java.nio.channels.FileChannel;
+import java.nio.channels.GatheringByteChannel;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.StandardOpenOption;
+import java.util.EnumSet;
 
 /**
- * Ingestion time dictionary identifier lookup, used by {@link NestedCommonFormatColumnSerializer} to build a global
- * dictionary id to value mapping for the 'stacked' global value dictionaries.
+ * Value to dictionary id lookup, backed with memory mapped dictionaries populated lazily by the supplied
+ * {@link DictionaryWriter}.
  */
-public class DictionaryIdLookup
+public final class DictionaryIdLookup implements Closeable
 {
-  private final Object2IntMap<String> stringLookup;
+  private final String name;
+  @Nullable
+  private final DictionaryWriter<String> stringDictionaryWriter;
+  private SmooshedFileMapper stringBufferMapper = null;
+  private Indexed<ByteBuffer> stringDictionary = null;
 
-  private final Long2IntMap longLookup;
+  @Nullable
+  private final DictionaryWriter<Long> longDictionaryWriter;
+  private MappedByteBuffer longBuffer = null;
+  private FixedIndexed<Long> longDictionary = null;
 
-  private final Double2IntMap doubleLookup;
+  @Nullable
+  private final DictionaryWriter<Double> doubleDictionaryWriter;
+  MappedByteBuffer doubleBuffer = null;
+  FixedIndexed<Double> doubleDictionary = null;
 
-  private final Object2IntMap<int[]> arrayLookup;
+  @Nullable
+  private final DictionaryWriter<int[]> arrayDictionaryWriter;
+  private MappedByteBuffer arrayBuffer = null;
+  private FrontCodedIntArrayIndexed arrayDictionary = null;
 
-  private int dictionarySize;
+  public DictionaryIdLookup(
+      String name,
+      @Nullable DictionaryWriter<String> stringDictionaryWriter,
+      @Nullable DictionaryWriter<Long> longDictionaryWriter,
+      @Nullable DictionaryWriter<Double> doubleDictionaryWriter,
+      @Nullable DictionaryWriter<int[]> arrayDictionaryWriter
+  )
+  {
+    this.name = name;
+    this.stringDictionaryWriter = stringDictionaryWriter;
+    this.longDictionaryWriter = longDictionaryWriter;
+    this.doubleDictionaryWriter = doubleDictionaryWriter;
+    this.arrayDictionaryWriter = arrayDictionaryWriter;
+  }
 
-  public DictionaryIdLookup()
+  public int lookupString(@Nullable String value)
   {
-    this.stringLookup = new Object2IntLinkedOpenHashMap<>();
-    stringLookup.defaultReturnValue(-1);
-    this.longLookup = new Long2IntLinkedOpenHashMap();
-    longLookup.defaultReturnValue(-1);
-    this.doubleLookup = new Double2IntLinkedOpenHashMap();
-    doubleLookup.defaultReturnValue(-1);
-    this.arrayLookup = new 
Object2IntAVLTreeMap<>(FrontCodedIntArrayIndexedWriter.ARRAY_COMPARATOR);
-    this.arrayLookup.defaultReturnValue(-1);
+    if (stringDictionary == null) {

Review Comment:
   What metric do you have in mind? Also, why 2 GB instead of just a metric on 
the size of the components? And shouldn't such a metric be much broader in 
scope, rather than applying only to nested columns? This doesn't feel like 
either a blocker or something that should be resolved in this PR, but I agree 
it would be nice to have a better breakdown of what is driving segment 
size.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] use mmap for nested column value to dictionary id lookup for more chill heap usage during serialization (druid)

Reply via email to