clintropolis commented on code in PR #14919:
URL: https://github.com/apache/druid/pull/14919#discussion_r1311070719


##########
processing/src/main/java/org/apache/druid/segment/nested/DictionaryIdLookup.java:
##########
@@ -19,114 +19,308 @@
 
 package org.apache.druid.segment.nested;
 
-import com.google.common.base.Preconditions;
-import it.unimi.dsi.fastutil.doubles.Double2IntLinkedOpenHashMap;
-import it.unimi.dsi.fastutil.doubles.Double2IntMap;
-import it.unimi.dsi.fastutil.longs.Long2IntLinkedOpenHashMap;
-import it.unimi.dsi.fastutil.longs.Long2IntMap;
-import it.unimi.dsi.fastutil.objects.Object2IntAVLTreeMap;
-import it.unimi.dsi.fastutil.objects.Object2IntLinkedOpenHashMap;
-import it.unimi.dsi.fastutil.objects.Object2IntMap;
-import org.apache.druid.segment.data.FrontCodedIntArrayIndexedWriter;
+import com.google.common.primitives.Ints;
+import org.apache.druid.annotations.SuppressFBWarnings;
+import org.apache.druid.error.DruidException;
+import org.apache.druid.java.util.common.ByteBufferUtils;
+import org.apache.druid.java.util.common.FileUtils;
+import org.apache.druid.java.util.common.ISE;
+import org.apache.druid.java.util.common.StringUtils;
+import org.apache.druid.java.util.common.io.smoosh.FileSmoosher;
+import org.apache.druid.java.util.common.io.smoosh.SmooshedFileMapper;
+import org.apache.druid.java.util.common.io.smoosh.SmooshedWriter;
+import org.apache.druid.segment.column.StringEncodingStrategies;
+import org.apache.druid.segment.column.TypeStrategies;
+import org.apache.druid.segment.data.DictionaryWriter;
+import org.apache.druid.segment.data.FixedIndexed;
+import org.apache.druid.segment.data.FrontCodedIntArrayIndexed;
+import org.apache.druid.segment.data.Indexed;
 
 import javax.annotation.Nullable;
+import java.io.Closeable;
+import java.io.File;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.nio.MappedByteBuffer;
+import java.nio.channels.FileChannel;
+import java.nio.channels.GatheringByteChannel;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.StandardOpenOption;
+import java.util.EnumSet;
 
 /**
- * Ingestion time dictionary identifier lookup, used by {@link 
NestedCommonFormatColumnSerializer} to build a global
- * dictionary id to value mapping for the 'stacked' global value dictionaries.
+ * Value to dictionary id lookup, backed with memory mapped dictionaries 
populated lazily by the supplied
+ * {@link DictionaryWriter}.
  */
-public class DictionaryIdLookup
+public final class DictionaryIdLookup implements Closeable
 {
-  private final Object2IntMap<String> stringLookup;
+  private final String name;
+  @Nullable
+  private final DictionaryWriter<String> stringDictionaryWriter;
+  private SmooshedFileMapper stringBufferMapper = null;
+  private Indexed<ByteBuffer> stringDictionary = null;
 
-  private final Long2IntMap longLookup;
+  @Nullable
+  private final DictionaryWriter<Long> longDictionaryWriter;
+  private MappedByteBuffer longBuffer = null;
+  private FixedIndexed<Long> longDictionary = null;
 
-  private final Double2IntMap doubleLookup;
+  @Nullable
+  private final DictionaryWriter<Double> doubleDictionaryWriter;
+  MappedByteBuffer doubleBuffer = null;
+  FixedIndexed<Double> doubleDictionary = null;
 
-  private final Object2IntMap<int[]> arrayLookup;
+  @Nullable
+  private final DictionaryWriter<int[]> arrayDictionaryWriter;
+  private MappedByteBuffer arrayBuffer = null;
+  private FrontCodedIntArrayIndexed arrayDictionary = null;
 
-  private int dictionarySize;
+  public DictionaryIdLookup(
+      String name,
+      @Nullable DictionaryWriter<String> stringDictionaryWriter,
+      @Nullable DictionaryWriter<Long> longDictionaryWriter,
+      @Nullable DictionaryWriter<Double> doubleDictionaryWriter,
+      @Nullable DictionaryWriter<int[]> arrayDictionaryWriter
+  )
+  {
+    this.name = name;
+    this.stringDictionaryWriter = stringDictionaryWriter;
+    this.longDictionaryWriter = longDictionaryWriter;
+    this.doubleDictionaryWriter = doubleDictionaryWriter;
+    this.arrayDictionaryWriter = arrayDictionaryWriter;
+  }
 
-  public DictionaryIdLookup()
+  public int lookupString(@Nullable String value)
   {
-    this.stringLookup = new Object2IntLinkedOpenHashMap<>();
-    stringLookup.defaultReturnValue(-1);
-    this.longLookup = new Long2IntLinkedOpenHashMap();
-    longLookup.defaultReturnValue(-1);
-    this.doubleLookup = new Double2IntLinkedOpenHashMap();
-    doubleLookup.defaultReturnValue(-1);
-    this.arrayLookup = new 
Object2IntAVLTreeMap<>(FrontCodedIntArrayIndexedWriter.ARRAY_COMPARATOR);
-    this.arrayLookup.defaultReturnValue(-1);
+    if (stringDictionary == null) {
+      // GenericIndexed v2 can write to multiple files if the dictionary is 
larger than 2gb, so we use a smooshfile
+      // for strings because of this. if other type dictionary writers could 
potentially use multiple internal files
+      // in the future, we should transition them to using this approach as 
well (or build a combination smoosher and
+      // mapper so that we can have a mutable smoosh)
+      File stringSmoosh = FileUtils.createTempDir(name + "__stringTempSmoosh");
+      final String fileName = 
NestedCommonFormatColumnSerializer.getInternalFileName(
+          name,
+          NestedCommonFormatColumnSerializer.STRING_DICTIONARY_FILE_NAME
+      );
+      final FileSmoosher smoosher = new FileSmoosher(stringSmoosh);
+      try (final SmooshedWriter writer = smoosher.addWithSmooshedWriter(
+          fileName,
+          stringDictionaryWriter.getSerializedSize()
+      )) {
+        stringDictionaryWriter.writeTo(writer, smoosher);
+        writer.close();
+        smoosher.close();
+        stringBufferMapper = SmooshedFileMapper.load(stringSmoosh);
+        final ByteBuffer stringBuffer = stringBufferMapper.mapFile(fileName);
+        stringDictionary = 
StringEncodingStrategies.getStringDictionarySupplier(

Review Comment:
   not sure what the question here is? We currently support 2 encoding 
strategies for string dictionaries, plain utf8 and front-coded which also 
stores utf8 strings, just incrementally encoded. 
https://github.com/apache/druid/blob/master/processing/src/main/java/org/apache/druid/segment/column/StringEncodingStrategy.java#L40
   
   Which strategy is used is controlled via the `IndexSpec` 
https://github.com/apache/druid/blob/master/processing/src/main/java/org/apache/druid/segment/IndexSpec.java#L84



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to