heesung-sn commented on code in PR #20948:
URL: https://github.com/apache/pulsar/pull/20948#discussion_r1302280235


##########
pulsar-broker/src/main/java/org/apache/pulsar/broker/service/persistent/MessageDeduplication.java:
##########
@@ -323,26 +323,29 @@ public MessageDupStatus isDuplicate(PublishContext 
publishContext, ByteBuf heade
 
         String producerName = publishContext.getProducerName();
         long sequenceId = publishContext.getSequenceId();
+        headersAndPayload.markReaderIndex();
+        MessageMetadata msgMetadata = 
Commands.parseMessageMetadata(headersAndPayload);

Review Comment:
   This will potentially increase the complexity of the dedup process because 
now we need to call `Commands.parseMessageMetadata(headersAndPayload);` every 
time. 
   
   Can we use `PublishContext.isChunked` and parse the metadata only when the 
message is chunked? (Also, is `chunkId` only available in the metadata?)



##########
pulsar-client/src/main/java/org/apache/pulsar/client/impl/ConsumerImpl.java:
##########
@@ -1449,6 +1449,15 @@ private ByteBuf processMessageChunk(ByteBuf 
compressedPayload, MessageMetadata m
         // discard message if chunk is out-of-order
         if (chunkedMsgCtx == null || chunkedMsgCtx.chunkedMsgBuffer == null
                 || msgMetadata.getChunkId() != 
(chunkedMsgCtx.lastChunkedMessageId + 1)) {
+            // Filter duplicated chunks instead of discard it.
+            if (chunkedMsgCtx == null || msgMetadata.getChunkId() <= 
chunkedMsgCtx.lastChunkedMessageId) {

Review Comment:
   It seems like the `if` above already handles the duplicated chunks, 
doesn't it?
   
   `|| msgMetadata.getChunkId() != (chunkedMsgCtx.lastChunkedMessageId + 1)`



##########
pulsar-broker/src/main/java/org/apache/pulsar/broker/service/persistent/MessageDeduplication.java:
##########
@@ -323,26 +323,29 @@ public MessageDupStatus isDuplicate(PublishContext 
publishContext, ByteBuf heade
 
         String producerName = publishContext.getProducerName();
         long sequenceId = publishContext.getSequenceId();
+        headersAndPayload.markReaderIndex();
+        MessageMetadata msgMetadata = 
Commands.parseMessageMetadata(headersAndPayload);
+        headersAndPayload.resetReaderIndex();
         long highestSequenceId = 
Math.max(publishContext.getHighestSequenceId(), sequenceId);
         if (producerName.startsWith(replicatorPrefix)) {
             // Message is coming from replication, we need to use the original 
producer name and sequence id
             // for the purpose of deduplication and not rely on the 
"replicator" name.
-            int readerIndex = headersAndPayload.readerIndex();
-            MessageMetadata md = 
Commands.parseMessageMetadata(headersAndPayload);
-            producerName = md.getProducerName();
-            sequenceId = md.getSequenceId();
-            highestSequenceId = Math.max(md.getHighestSequenceId(), 
sequenceId);
+            producerName = msgMetadata.getProducerName();
+            sequenceId = msgMetadata.getSequenceId();

Review Comment:
   Can we use `context.getOriginalProducerName()` and 
`context.getOriginalSequenceId()` if they are passed down here? Then we 
probably don't need to parse the metadata here.



##########
pulsar-broker/src/main/java/org/apache/pulsar/broker/service/persistent/MessageDeduplication.java:
##########
@@ -323,26 +323,29 @@ public MessageDupStatus isDuplicate(PublishContext 
publishContext, ByteBuf heade
 
         String producerName = publishContext.getProducerName();
         long sequenceId = publishContext.getSequenceId();
+        headersAndPayload.markReaderIndex();
+        MessageMetadata msgMetadata = 
Commands.parseMessageMetadata(headersAndPayload);
+        headersAndPayload.resetReaderIndex();
         long highestSequenceId = 
Math.max(publishContext.getHighestSequenceId(), sequenceId);
         if (producerName.startsWith(replicatorPrefix)) {
             // Message is coming from replication, we need to use the original 
producer name and sequence id
             // for the purpose of deduplication and not rely on the 
"replicator" name.
-            int readerIndex = headersAndPayload.readerIndex();
-            MessageMetadata md = 
Commands.parseMessageMetadata(headersAndPayload);
-            producerName = md.getProducerName();
-            sequenceId = md.getSequenceId();
-            highestSequenceId = Math.max(md.getHighestSequenceId(), 
sequenceId);
+            producerName = msgMetadata.getProducerName();
+            sequenceId = msgMetadata.getSequenceId();
+            highestSequenceId = Math.max(msgMetadata.getHighestSequenceId(), 
sequenceId);

Review Comment:
   Can we use `publishContext.getHighestSequenceId()`? I think we should 
minimize the dependency on the message metadata.



##########
pulsar-broker/src/test/java/org/apache/pulsar/client/impl/MessageChunkingSharedTest.java:
##########
@@ -211,6 +213,84 @@ private static String createChunkedMessage(int numChunks) {
         return Schema.STRING.decode(payload);
     }
 
+    @Test
+    public void testDuplicateForChunkMessage() throws Exception {
+        this.conf.setBrokerDeduplicationEnabled(true);
+        restartBroker();
+        String topicName = 
"persistent://my-property/my-ns/testDuplicateForChunkMessage";
+        String producerName = "test-producer";
+        // consumer
+        Consumer<String> consumer = pulsarClient
+                .newConsumer(Schema.STRING)
+                .subscriptionName("test-sub")
+                .topic(topicName)
+                .subscribe();
+        // producer
+        Producer<String> partProducer = pulsarClient
+                .newProducer(Schema.STRING)
+                .producerName(producerName)
+                .topic(topicName)
+                .enableChunking(true)
+                .enableBatching(false)
+                .create();
+        int messageSize = 6000; // payload size in KB
+        String message = "a".repeat(messageSize * 1000);
+        partProducer.newMessage().value(message).send();
+        Message<String> msg = consumer.receive(5, TimeUnit.SECONDS);
+        assertNotNull(msg);
+        assertTrue(msg.getMessageId() instanceof ChunkMessageIdImpl);
+        assertEquals(msg.getValue(), message);
+
+        Field msgIdGenerator = 
ProducerImpl.class.getDeclaredField("msgIdGenerator");
+        msgIdGenerator.setAccessible(true);
+        assertEquals(msg.getSequenceId() + 1, 
msgIdGenerator.get(partProducer));
+
+        String message2 = "b".repeat(messageSize * 2);
+        partProducer.newMessage().value(message2).send();
+        Message<String> msg2 = consumer.receive(5, TimeUnit.SECONDS);
+        assertFalse(msg2.getMessageId() instanceof ChunkMessageIdImpl);
+        assertEquals(msg2.getValue(), message2);
+
+        long sequenceID = (long) msgIdGenerator.get(partProducer) + 1024L;
+        String message3 = "c".repeat(messageSize * 1000);
+        
partProducer.newMessage().value(message3).sequenceId(sequenceID).send();
+        Message<String> msg3 = consumer.receive(5, TimeUnit.SECONDS);
+        assertNotNull(msg3);
+        assertTrue(msg3.getMessageId() instanceof ChunkMessageIdImpl);
+        assertEquals(msg3.getValue(), message3);
+        assertEquals(msg3.getSequenceId(), sequenceID);
+    }
+
+    @Test
+    public void testDeduplicateChunksInSingleChunkMessages() throws Exception {
+        this.conf.setBrokerDeduplicationEnabled(true);
+        restartBroker();
+        String topicName = 
"persistent://my-property/my-ns/testDeduplicateChunksInSingleChunkMessage";
+        String producerName = "test-producer";
+        // consumer
+        Consumer<String> consumer = pulsarClient
+                .newConsumer(Schema.STRING)
+                .subscriptionName("test-sub")
+                .topic(topicName)
+                .subscribe();
+        final PersistentTopic persistentTopic = (PersistentTopic) 
pulsar.getBrokerService()
+                .getTopicIfExists(topicName).get().orElse(null);
+        assertNotNull(persistentTopic);
+        sendChunk(persistentTopic, "test-producer", 1, 0, 2);

Review Comment:
   Use the `producerName` variable here instead of repeating the 
`"test-producer"` literal.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to