Github user tdas commented on a diff in the pull request:
https://github.com/apache/spark/pull/22042#discussion_r209478033
--- Diff:
external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaDataConsumer.scala
---
@@ -251,32 +274,53 @@ private[kafka010] case class InternalKafkaConsumer(
untilOffset: Long,
pollTimeoutMs: Long,
failOnDataLoss: Boolean): ConsumerRecord[Array[Byte], Array[Byte]] =
{
- if (offset != nextOffsetInFetchedData || !fetchedData.hasNext()) {
- // This is the first fetch, or the last pre-fetched data has been
drained.
+ if (offset != nextOffsetInFetchedData) {
+ // This is the first fetch, or the fetched data has been reset.
// Seek to the offset because we may call seekToBeginning or
seekToEnd before this.
seek(offset)
poll(pollTimeoutMs)
+ } else if (!fetchedData.hasNext) {
+ // The last pre-fetched data has been drained.
+ if (offset < offsetAfterPoll) {
+ // Offsets in [offset, offsetAfterPoll) are missing. We should
skip them.
+ resetFetchedData()
+ throw new MissingOffsetException(offset, offsetAfterPoll)
+ } else {
+ seek(offset)
+ poll(pollTimeoutMs)
+ }
}
if (!fetchedData.hasNext()) {
- // We cannot fetch anything after `poll`. Two possible cases:
+ // We cannot fetch anything after `poll`. Three possible cases:
// - `offset` is out of range so that Kafka returns nothing. Just
throw
// `OffsetOutOfRangeException` to let the caller handle it.
// - Cannot fetch any data before timeout. TimeoutException will be
thrown.
+ // - Fetched something but all of them are not valid data messages.
In this case, the position
+ // will be changed and we can use it to determine this case.
val range = getAvailableOffsetRange()
if (offset < range.earliest || offset >= range.latest) {
throw new OffsetOutOfRangeException(
Map(topicPartition -> java.lang.Long.valueOf(offset)).asJava)
- } else {
+ } else if (offsetBeforePoll == offsetAfterPoll) {
throw new TimeoutException(
s"Cannot fetch record for offset $offset in $pollTimeoutMs
milliseconds")
+ } else {
+ assert(offset <= offsetAfterPoll,
+ s"seek to $offset and poll but the offset was reset to
$offsetAfterPoll")
+ throw new MissingOffsetException(offset, offsetAfterPoll)
}
} else {
--- End diff --
Let's remove this `else` to reduce the condition nesting. Every branch of the
preceding `if` block throws an exception, so the code that follows does not need to be wrapped in an `else`.
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]