zhtaoxiang commented on code in PR #13636:
URL: https://github.com/apache/pinot/pull/13636#discussion_r1718826384
##########
pinot-segment-local/src/main/java/org/apache/pinot/segment/local/dedup/ConcurrentMapPartitionDedupMetadataManager.java:
##########
@@ -19,106 +19,99 @@
package org.apache.pinot.segment.local.dedup;
import com.google.common.annotations.VisibleForTesting;
-import java.util.HashMap;
+import com.google.common.util.concurrent.AtomicDouble;
+import java.io.IOException;
import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
+import org.apache.commons.lang3.tuple.Pair;
import org.apache.pinot.common.metrics.ServerGauge;
-import org.apache.pinot.common.metrics.ServerMetrics;
-import org.apache.pinot.segment.local.segment.readers.PinotSegmentColumnReader;
import org.apache.pinot.segment.local.utils.HashUtils;
import org.apache.pinot.segment.spi.IndexSegment;
-import org.apache.pinot.spi.config.table.HashFunction;
-import org.apache.pinot.spi.data.readers.PrimaryKey;
-import org.apache.pinot.spi.utils.ByteArray;
-class ConcurrentMapPartitionDedupMetadataManager implements
PartitionDedupMetadataManager {
- private final String _tableNameWithType;
- private final List<String> _primaryKeyColumns;
- private final int _partitionId;
- private final ServerMetrics _serverMetrics;
- private final HashFunction _hashFunction;
+class ConcurrentMapPartitionDedupMetadataManager extends
BasePartitionDedupMetadataManager {
@VisibleForTesting
- final ConcurrentHashMap<Object, IndexSegment> _primaryKeyToSegmentMap = new
ConcurrentHashMap<>();
+ final AtomicDouble _largestSeenTime = new AtomicDouble(0);
+ @VisibleForTesting
+ final ConcurrentHashMap<Object, Pair<IndexSegment, Double>>
_primaryKeyToSegmentAndTimeMap =
+ new ConcurrentHashMap<>();
- public ConcurrentMapPartitionDedupMetadataManager(String tableNameWithType,
List<String> primaryKeyColumns,
- int partitionId, ServerMetrics serverMetrics, HashFunction hashFunction)
{
- _tableNameWithType = tableNameWithType;
- _primaryKeyColumns = primaryKeyColumns;
- _partitionId = partitionId;
- _serverMetrics = serverMetrics;
- _hashFunction = hashFunction;
+ protected ConcurrentMapPartitionDedupMetadataManager(String
tableNameWithType, int partitionId,
+ DedupContext dedupContext) {
+ super(tableNameWithType, partitionId, dedupContext);
}
- public void addSegment(IndexSegment segment) {
- // Add all PKs to _primaryKeyToSegmentMap
- Iterator<PrimaryKey> primaryKeyIterator = getPrimaryKeyIterator(segment);
- while (primaryKeyIterator.hasNext()) {
- PrimaryKey pk = primaryKeyIterator.next();
- _primaryKeyToSegmentMap.put(HashUtils.hashPrimaryKey(pk, _hashFunction),
segment);
+ @Override
+ protected void doAddSegment(IndexSegment segment, Iterator<DedupRecordInfo>
dedupRecordInfoIterator) {
+ while (dedupRecordInfoIterator.hasNext()) {
+ DedupRecordInfo dedupRecordInfo = dedupRecordInfoIterator.next();
+ double dedupTime = dedupRecordInfo.getDedupTime();
+ _largestSeenTime.getAndUpdate(time -> Math.max(time, dedupTime));
+
_primaryKeyToSegmentAndTimeMap.compute(HashUtils.hashPrimaryKey(dedupRecordInfo.getPrimaryKey(),
_hashFunction),
Review Comment:
The default implementation of the replaceSegment method in the previous
change removes old segment first, then adds the new segment.
This implementation has a problem: the replacement logic is not atomic.
Specifically, just after the old segment is removed and before the replacement
segment is added, a record with a primary key that is contained in both the old
and the new segments can be added successfully - this is a duplication.
This is fixed in
https://github.com/apache/pinot/pull/13636/commits/eb4065fb406302fb6b21275fcb193494fd7c27da
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]