klsince commented on code in PR #13636:
URL: https://github.com/apache/pinot/pull/13636#discussion_r1717638634


##########
pinot-segment-local/src/main/java/org/apache/pinot/segment/local/dedup/ConcurrentMapPartitionDedupMetadataManager.java:
##########
@@ -19,106 +19,99 @@
 package org.apache.pinot.segment.local.dedup;
 
 import com.google.common.annotations.VisibleForTesting;
-import java.util.HashMap;
+import com.google.common.util.concurrent.AtomicDouble;
+import java.io.IOException;
 import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
 import java.util.concurrent.ConcurrentHashMap;
+import org.apache.commons.lang3.tuple.Pair;
 import org.apache.pinot.common.metrics.ServerGauge;
-import org.apache.pinot.common.metrics.ServerMetrics;
-import org.apache.pinot.segment.local.segment.readers.PinotSegmentColumnReader;
 import org.apache.pinot.segment.local.utils.HashUtils;
 import org.apache.pinot.segment.spi.IndexSegment;
-import org.apache.pinot.spi.config.table.HashFunction;
-import org.apache.pinot.spi.data.readers.PrimaryKey;
-import org.apache.pinot.spi.utils.ByteArray;
 
-class ConcurrentMapPartitionDedupMetadataManager implements PartitionDedupMetadataManager {
-  private final String _tableNameWithType;
-  private final List<String> _primaryKeyColumns;
-  private final int _partitionId;
-  private final ServerMetrics _serverMetrics;
-  private final HashFunction _hashFunction;
 
+class ConcurrentMapPartitionDedupMetadataManager extends BasePartitionDedupMetadataManager {
   @VisibleForTesting
-  final ConcurrentHashMap<Object, IndexSegment> _primaryKeyToSegmentMap = new ConcurrentHashMap<>();
+  final AtomicDouble _largestSeenTime = new AtomicDouble(0);
+  @VisibleForTesting
+  final ConcurrentHashMap<Object, Pair<IndexSegment, Double>> _primaryKeyToSegmentAndTimeMap =
+      new ConcurrentHashMap<>();
 
-  public ConcurrentMapPartitionDedupMetadataManager(String tableNameWithType, List<String> primaryKeyColumns,
-      int partitionId, ServerMetrics serverMetrics, HashFunction hashFunction) {
-    _tableNameWithType = tableNameWithType;
-    _primaryKeyColumns = primaryKeyColumns;
-    _partitionId = partitionId;
-    _serverMetrics = serverMetrics;
-    _hashFunction = hashFunction;
+  protected ConcurrentMapPartitionDedupMetadataManager(String tableNameWithType, int partitionId,
+      DedupContext dedupContext) {
+    super(tableNameWithType, partitionId, dedupContext);
   }
 
-  public void addSegment(IndexSegment segment) {
-    // Add all PKs to _primaryKeyToSegmentMap
-    Iterator<PrimaryKey> primaryKeyIterator = getPrimaryKeyIterator(segment);
-    while (primaryKeyIterator.hasNext()) {
-      PrimaryKey pk = primaryKeyIterator.next();
-      _primaryKeyToSegmentMap.put(HashUtils.hashPrimaryKey(pk, _hashFunction), segment);
+  @Override
+  protected void doAddSegment(IndexSegment segment, Iterator<DedupRecordInfo> dedupRecordInfoIterator) {
+    while (dedupRecordInfoIterator.hasNext()) {
+      DedupRecordInfo dedupRecordInfo = dedupRecordInfoIterator.next();
+      double dedupTime = dedupRecordInfo.getDedupTime();
+      _largestSeenTime.getAndUpdate(time -> Math.max(time, dedupTime));
+      _primaryKeyToSegmentAndTimeMap.compute(HashUtils.hashPrimaryKey(dedupRecordInfo.getPrimaryKey(), _hashFunction),

Review Comment:
   perhaps add a comment, or emit a WARN log at the end of adding the segment, noting that the segment contains PKs that duplicate those in existing segments



##########
pinot-segment-local/src/main/java/org/apache/pinot/segment/local/upsert/UpsertUtils.java:
##########
@@ -141,48 +141,10 @@ public void close()
         throws IOException {
       _primaryKeyReader.close();
       _comparisonColumnReader.close();
-    }
-  }
-
-  public static class PrimaryKeyReader implements Closeable {
-    public final List<PinotSegmentColumnReader> _primaryKeyColumnReaders;
-
-    public PrimaryKeyReader(IndexSegment segment, List<String> primaryKeyColumns) {
-      _primaryKeyColumnReaders = new ArrayList<>(primaryKeyColumns.size());
-      for (String primaryKeyColumn : primaryKeyColumns) {
-        _primaryKeyColumnReaders.add(new PinotSegmentColumnReader(segment, primaryKeyColumn));
-      }
-    }
-
-    public PrimaryKey getPrimaryKey(int docId) {
-      int numPrimaryKeys = _primaryKeyColumnReaders.size();
-      Object[] values = new Object[numPrimaryKeys];
-      for (int i = 0; i < numPrimaryKeys; i++) {
-        values[i] = getValue(_primaryKeyColumnReaders.get(i), docId);
-      }
-      return new PrimaryKey(values);
-    }
-
-    public void getPrimaryKey(int docId, PrimaryKey buffer) {
-      Object[] values = buffer.getValues();
-      int numPrimaryKeys = values.length;
-      for (int i = 0; i < numPrimaryKeys; i++) {
-        values[i] = getValue(_primaryKeyColumnReaders.get(i), docId);
+      if (_deleteRecordColumnReader != null) {
+        _deleteRecordColumnReader.close();

Review Comment:
   was this missed? good catch.
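
   For reference, a more defensive variant would still close the later readers if an earlier close() throws (a sketch only, not necessarily needed here):

   ```java
   // Sketch: nested try/finally guarantees each reader's close() is attempted
   // even if a previous one throws; field names mirror the diff above.
   @Override
   public void close()
       throws IOException {
     try {
       _primaryKeyReader.close();
     } finally {
       try {
         _comparisonColumnReader.close();
       } finally {
         if (_deleteRecordColumnReader != null) {
           _deleteRecordColumnReader.close();
         }
       }
     }
   }
   ```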



##########
pinot-spi/src/main/java/org/apache/pinot/spi/config/table/DedupConfig.java:
##########
@@ -20,25 +20,45 @@
 
 import com.fasterxml.jackson.annotation.JsonCreator;
 import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonPropertyDescription;
+import java.util.Map;
 import org.apache.pinot.spi.config.BaseJsonConfig;
 
 public class DedupConfig extends BaseJsonConfig {
+  @JsonPropertyDescription("Whether dedup is enabled or not.")
   private final boolean _dedupEnabled;
+  @JsonPropertyDescription("Function to hash the primary key.")
   private final HashFunction _hashFunction;
+  @JsonPropertyDescription("Custom class for dedup metadata manager.")
   private final String _metadataManagerClass;
+  @JsonPropertyDescription("Custom configs for dedup metadata manager")
+  private final Map<String, String> _metadataManagerConfigs;
+  @JsonPropertyDescription("When larger than 0, use it for dedup metadata 
cleanup, it uses the same unit as the "
+      + "metadata time column. The metadata will be cleaned up when the 
metadata time is older than the current time "

Review Comment:
   as to 'metadata time column', did you mean the dedupTimeColumn? 
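
   If so, a minimal sketch of the expiry check that this description seems to imply (an illustration under that assumption, not the PR's exact logic):

   ```java
   // Hypothetical sketch: an entry is expired when its dedup time falls behind
   // the largest dedup time seen so far by more than the configured TTL.
   // Both values are in the unit of the dedup time column.
   static boolean isExpired(double dedupTime, double largestSeenTime, double metadataTTL) {
     return metadataTTL > 0 && dedupTime < largestSeenTime - metadataTTL;
   }
   ```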



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

