heesung-sn commented on code in PR #20948:
URL: https://github.com/apache/pulsar/pull/20948#discussion_r1302280235
##########
pulsar-broker/src/main/java/org/apache/pulsar/broker/service/persistent/MessageDeduplication.java:
##########
@@ -323,26 +323,29 @@ public MessageDupStatus isDuplicate(PublishContext
publishContext, ByteBuf heade
String producerName = publishContext.getProducerName();
long sequenceId = publishContext.getSequenceId();
+ headersAndPayload.markReaderIndex();
+ MessageMetadata msgMetadata =
Commands.parseMessageMetadata(headersAndPayload);
Review Comment:
This will potentially increase the complexity of the dedup process because
we now need to call `Commands.parseMessageMetadata(headersAndPayload);` for
every message.
Can we use `PublishContext.isChunked` and parse the metadata only when the
message is chunked? (Also, is the chunkId only available in the metadata?)
##########
pulsar-client/src/main/java/org/apache/pulsar/client/impl/ConsumerImpl.java:
##########
@@ -1449,6 +1449,15 @@ private ByteBuf processMessageChunk(ByteBuf
compressedPayload, MessageMetadata m
// discard message if chunk is out-of-order
if (chunkedMsgCtx == null || chunkedMsgCtx.chunkedMsgBuffer == null
|| msgMetadata.getChunkId() !=
(chunkedMsgCtx.lastChunkedMessageId + 1)) {
+ // Filter duplicated chunks instead of discard it.
+ if (chunkedMsgCtx == null || msgMetadata.getChunkId() <=
chunkedMsgCtx.lastChunkedMessageId) {
Review Comment:
It seems the `if` above already handles duplicated chunks, doesn't it?
`|| msgMetadata.getChunkId() != (chunkedMsgCtx.lastChunkedMessageId + 1)`
##########
pulsar-broker/src/main/java/org/apache/pulsar/broker/service/persistent/MessageDeduplication.java:
##########
@@ -323,26 +323,29 @@ public MessageDupStatus isDuplicate(PublishContext
publishContext, ByteBuf heade
String producerName = publishContext.getProducerName();
long sequenceId = publishContext.getSequenceId();
+ headersAndPayload.markReaderIndex();
+ MessageMetadata msgMetadata =
Commands.parseMessageMetadata(headersAndPayload);
+ headersAndPayload.resetReaderIndex();
long highestSequenceId =
Math.max(publishContext.getHighestSequenceId(), sequenceId);
if (producerName.startsWith(replicatorPrefix)) {
// Message is coming from replication, we need to use the original
producer name and sequence id
// for the purpose of deduplication and not rely on the
"replicator" name.
- int readerIndex = headersAndPayload.readerIndex();
- MessageMetadata md =
Commands.parseMessageMetadata(headersAndPayload);
- producerName = md.getProducerName();
- sequenceId = md.getSequenceId();
- highestSequenceId = Math.max(md.getHighestSequenceId(),
sequenceId);
+ producerName = msgMetadata.getProducerName();
+ sequenceId = msgMetadata.getSequenceId();
Review Comment:
Can we use `context.getOriginalProducerName()` and
`context.getOriginalSequenceId()` if they are passed down here? Then we
probably don't need to parse the metadata here.
##########
pulsar-broker/src/main/java/org/apache/pulsar/broker/service/persistent/MessageDeduplication.java:
##########
@@ -323,26 +323,29 @@ public MessageDupStatus isDuplicate(PublishContext
publishContext, ByteBuf heade
String producerName = publishContext.getProducerName();
long sequenceId = publishContext.getSequenceId();
+ headersAndPayload.markReaderIndex();
+ MessageMetadata msgMetadata =
Commands.parseMessageMetadata(headersAndPayload);
+ headersAndPayload.resetReaderIndex();
long highestSequenceId =
Math.max(publishContext.getHighestSequenceId(), sequenceId);
if (producerName.startsWith(replicatorPrefix)) {
// Message is coming from replication, we need to use the original
producer name and sequence id
// for the purpose of deduplication and not rely on the
"replicator" name.
- int readerIndex = headersAndPayload.readerIndex();
- MessageMetadata md =
Commands.parseMessageMetadata(headersAndPayload);
- producerName = md.getProducerName();
- sequenceId = md.getSequenceId();
- highestSequenceId = Math.max(md.getHighestSequenceId(),
sequenceId);
+ producerName = msgMetadata.getProducerName();
+ sequenceId = msgMetadata.getSequenceId();
+ highestSequenceId = Math.max(msgMetadata.getHighestSequenceId(),
sequenceId);
Review Comment:
Can we use `publishContext.getHighestSequenceId()`? I think we should
minimize the dependency on the metadata.
##########
pulsar-broker/src/test/java/org/apache/pulsar/client/impl/MessageChunkingSharedTest.java:
##########
@@ -211,6 +213,84 @@ private static String createChunkedMessage(int numChunks) {
return Schema.STRING.decode(payload);
}
+ @Test
+ public void testDuplicateForChunkMessage() throws Exception {
+ this.conf.setBrokerDeduplicationEnabled(true);
+ restartBroker();
+ String topicName =
"persistent://my-property/my-ns/testDuplicateForChunkMessage";
+ String producerName = "test-producer";
+ // consumer
+ Consumer<String> consumer = pulsarClient
+ .newConsumer(Schema.STRING)
+ .subscriptionName("test-sub")
+ .topic(topicName)
+ .subscribe();
+ // producer
+ Producer<String> partProducer = pulsarClient
+ .newProducer(Schema.STRING)
+ .producerName(producerName)
+ .topic(topicName)
+ .enableChunking(true)
+ .enableBatching(false)
+ .create();
+ int messageSize = 6000; // payload size in KB
+ String message = "a".repeat(messageSize * 1000);
+ partProducer.newMessage().value(message).send();
+ Message<String> msg = consumer.receive(5, TimeUnit.SECONDS);
+ assertNotNull(msg);
+ assertTrue(msg.getMessageId() instanceof ChunkMessageIdImpl);
+ assertEquals(msg.getValue(), message);
+
+ Field msgIdGenerator =
ProducerImpl.class.getDeclaredField("msgIdGenerator");
+ msgIdGenerator.setAccessible(true);
+ assertEquals(msg.getSequenceId() + 1,
msgIdGenerator.get(partProducer));
+
+ String message2 = "b".repeat(messageSize * 2);
+ partProducer.newMessage().value(message2).send();
+ Message<String> msg2 = consumer.receive(5, TimeUnit.SECONDS);
+ assertFalse(msg2.getMessageId() instanceof ChunkMessageIdImpl);
+ assertEquals(msg2.getValue(), message2);
+
+ long sequenceID = (long) msgIdGenerator.get(partProducer) + 1024L;
+ String message3 = "c".repeat(messageSize * 1000);
+
partProducer.newMessage().value(message3).sequenceId(sequenceID).send();
+ Message<String> msg3 = consumer.receive(5, TimeUnit.SECONDS);
+ assertNotNull(msg3);
+ assertTrue(msg3.getMessageId() instanceof ChunkMessageIdImpl);
+ assertEquals(msg3.getValue(), message3);
+ assertEquals(msg3.getSequenceId(), sequenceID);
+ }
+
+ @Test
+ public void testDeduplicateChunksInSingleChunkMessages() throws Exception {
+ this.conf.setBrokerDeduplicationEnabled(true);
+ restartBroker();
+ String topicName =
"persistent://my-property/my-ns/testDeduplicateChunksInSingleChunkMessage";
+ String producerName = "test-producer";
+ // consumer
+ Consumer<String> consumer = pulsarClient
+ .newConsumer(Schema.STRING)
+ .subscriptionName("test-sub")
+ .topic(topicName)
+ .subscribe();
+ final PersistentTopic persistentTopic = (PersistentTopic)
pulsar.getBrokerService()
+ .getTopicIfExists(topicName).get().orElse(null);
+ assertNotNull(persistentTopic);
+ sendChunk(persistentTopic, "test-producer", 1, 0, 2);
Review Comment:
Use the `producerName` variable here instead of the `"test-producer"` literal.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]