Re: [PR] Enhance Kinesis consumer [pinot]

via GitHub Mon, 08 Apr 2024 15:13:57 -0700


Jackie-Jiang commented on code in PR #12806:
URL: https://github.com/apache/pinot/pull/12806#discussion_r1556480965



##########
pinot-plugins/pinot-stream-ingestion/pinot-kinesis/src/main/java/org/apache/pinot/plugin/stream/kinesis/KinesisConsumer.java:
##########
@@ -69,113 +60,56 @@ public KinesisConsumer(KinesisConfig config, KinesisClient kinesisClient) {
     super(config, kinesisClient);
   }
 
-  /**
-   * Fetch records from the Kinesis stream between the start and end 
KinesisCheckpoint
-   */
   @Override
-  public KinesisMessageBatch fetchMessages(StreamPartitionMsgOffset 
startMsgOffset, int timeoutMs) {
+  public synchronized KinesisMessageBatch fetchMessages(StreamPartitionMsgOffset startMsgOffset, int timeoutMs) {
     KinesisPartitionGroupOffset startOffset = (KinesisPartitionGroupOffset) 
startMsgOffset;
-    List<BytesStreamMessage> messages = new ArrayList<>();
-    Future<KinesisMessageBatch> kinesisFetchResultFuture =
-        _executorService.submit(() -> getResult(startOffset, messages));
-    try {
-      return kinesisFetchResultFuture.get(timeoutMs, TimeUnit.MILLISECONDS);
-    } catch (TimeoutException e) {
-      kinesisFetchResultFuture.cancel(true);
-    } catch (Exception e) {
-      // Ignored
-    }
-    return buildKinesisMessageBatch(startOffset, messages, false);
-  }
-
-  private KinesisMessageBatch getResult(KinesisPartitionGroupOffset 
startOffset, List<BytesStreamMessage> messages) {
-    try {
-      String shardId = startOffset.getShardId();
-      String shardIterator = getShardIterator(shardId, 
startOffset.getSequenceNumber());
-      boolean endOfShard = false;
-      long currentWindow = System.currentTimeMillis() / 
SLEEP_TIME_BETWEEN_REQUESTS;
-      int currentWindowRequests = 0;
-      while (shardIterator != null) {
-        GetRecordsRequest getRecordsRequest = 
GetRecordsRequest.builder().shardIterator(shardIterator).build();
-        long requestSentTime = System.currentTimeMillis() / 1000;
-        GetRecordsResponse getRecordsResponse = 
_kinesisClient.getRecords(getRecordsRequest);
-        List<Record> records = getRecordsResponse.records();
-        if (!records.isEmpty()) {
-          for (Record record : records) {
-            messages.add(extractStreamMessage(record, shardId));
-          }
-          if (messages.size() >= _config.getNumMaxRecordsToFetch()) {
-            break;
-          }
+    String shardId = startOffset.getShardId();
+    String startSequenceNumber = startOffset.getSequenceNumber();
+
+    // NOTE: Kinesis enforces a limit of 5 getRecords request per second on each shard from AWS end, beyond which we
+    //       start getting ProvisionedThroughputExceededException. Rate limit the requests to avoid this.
+    long currentTimeMs = System.currentTimeMillis();
+    int currentTimeSeconds = (int) 
TimeUnit.MILLISECONDS.toSeconds(currentTimeMs);
+    if (currentTimeSeconds == _currentSecond) {
+      if (_numRequestsInCurrentSecond == _config.getRpsLimit()) {
+        try {
+          Thread.sleep(1000 - (currentTimeMs % 1000));
+        } catch (InterruptedException e) {
+          throw new RuntimeException(e);
         }
-
-        if (getRecordsResponse.hasChildShards() && 
!getRecordsResponse.childShards().isEmpty()) {
-          //This statement returns true only when end of current shard has 
reached.
-          // hasChildShards only checks if the childShard is null and is a 
valid instance.
-          endOfShard = true;
-          break;
-        }
-
-        shardIterator = getRecordsResponse.nextShardIterator();
-
-        if (Thread.interrupted()) {
-          break;
-        }
-
-        // Kinesis enforces a limit of 5 .getRecords request per second on 
each shard from AWS end
-        // Beyond this limit we start getting 
ProvisionedThroughputExceededException which affect the ingestion
-        if (requestSentTime == currentWindow) {
-          currentWindowRequests++;
-        } else if (requestSentTime > currentWindow) {
-          currentWindow = requestSentTime;
-          currentWindowRequests = 0;
-        }
-
-        if (currentWindowRequests >= _config.getNumMaxRecordsToFetch()) {
-          try {
-            Thread.sleep(SLEEP_TIME_BETWEEN_REQUESTS);
-          } catch (InterruptedException e) {
-            LOGGER.debug("Sleep interrupted while rate limiting Kinesis 
requests", e);
-            break;
-          }
-        }
-      }
-
-      return buildKinesisMessageBatch(startOffset, messages, endOfShard);
-    } catch (IllegalStateException e) {
-      debugOrLogWarning("Illegal state exception, connection is broken", e);
-    } catch (ProvisionedThroughputExceededException e) {
-      debugOrLogWarning("The request rate for the stream is too high", e);
-    } catch (ExpiredIteratorException e) {
-      debugOrLogWarning("ShardIterator expired while trying to fetch records", 
e);
-    } catch (ResourceNotFoundException | InvalidArgumentException e) {
-      // aws errors
-      LOGGER.error("Encountered AWS error while attempting to fetch records", 
e);
-    } catch (KinesisException e) {
-      debugOrLogWarning("Encountered unknown unrecoverable AWS exception", e);
-      throw new RuntimeException(e);
-    } catch (AbortedException e) {
-      if (!(e.getCause() instanceof InterruptedException)) {
-        debugOrLogWarning("Task aborted due to exception", e);
+        _currentSecond++;
+        _numRequestsInCurrentSecond = 1;
+      } else {
+        _numRequestsInCurrentSecond++;
       }
-    } catch (Throwable e) {
-      // non transient errors
-      LOGGER.error("Unknown fetchRecords exception", e);
-      throw new RuntimeException(e);
+    } else {
+      _currentSecond = currentTimeSeconds;
+      _numRequestsInCurrentSecond = 1;
     }
-    return buildKinesisMessageBatch(startOffset, messages, false);
-  }
 
-  private void debugOrLogWarning(String message, Throwable throwable) {
-    if (LOGGER.isDebugEnabled()) {
-      LOGGER.debug(message, throwable);
+    // Get the shard iterator
+    String shardIterator;
+    if (startSequenceNumber.equals(_nextStartSequenceNumber)) {
+      shardIterator = _nextShardIterator;
     } else {
-      LOGGER.warn(message + ": " + throwable.getMessage());
+      // TODO: Revisit this logic to see if we always miss the first message when consuming from a new shard

Review Comment:
   Ack



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] Enhance Kinesis consumer [pinot]

Reply via email to