Ma77Ball commented on code in PR #5714:
URL: https://github.com/apache/texera/pull/5714#discussion_r3448005559
##########
amber/src/main/scala/org/apache/texera/web/resource/SyncExecutionResource.scala:
##########
@@ -375,6 +375,104 @@ class SyncExecutionResource extends LazyLogging {
}
}
+ /**
+ * Blocks until every target operator's default external result port holds
at least as many rows
+ * as its stats report, or until `timeoutMillis` elapses. Operators with no
result storage are
+ * treated as ready.
+ */
+ private def awaitResultsPersisted(
+ executionId: ExecutionIdentity,
+ executionService: org.apache.texera.web.service.WorkflowExecutionService,
+ targetOperatorIds: List[String],
+ timeoutMillis: Long = 2000L,
+ pollIntervalMillis: Long = 25L
+ ): Unit = {
+ def expectedOutputCount(opId: String): Long =
+ expectedDefaultPortOutputCount(
+ executionService.executionStateStore.statsStore.getState,
+ opId
+ )
+
+ def committedCount(opId: String): Option[Long] =
+ committedDefaultPortCount(
+ op =>
+ WorkflowExecutionsResource
+ .getResultUriByLogicalPortId(executionId, OperatorIdentity(op),
PortIdentity()),
+ uri =>
+ DocumentFactory
+ .openDocument(uri)
+ ._1
+ .asInstanceOf[VirtualDocument[Tuple]]
+ .getCount
+ )(opId)
+
+ awaitUntil(
+ targetOperatorIds,
+ expectedOutputCount,
+ committedCount,
+ timeoutMillis,
+ pollIntervalMillis,
+ () => System.currentTimeMillis(),
Review Comment:
Switched the injected clock to
`TimeUnit.NANOSECONDS.toMillis(System.nanoTime())`: it is monotonic, so
wall-clock adjustments (e.g. NTP) cannot move the deadline, and the unit is
still milliseconds.
##########
amber/src/main/scala/org/apache/texera/web/resource/SyncExecutionResource.scala:
##########
@@ -375,6 +375,104 @@ class SyncExecutionResource extends LazyLogging {
}
}
+ /**
+ * Blocks until every target operator's default external result port holds
at least as many rows
+ * as its stats report, or until `timeoutMillis` elapses. Operators with no
result storage are
+ * treated as ready.
+ */
+ private def awaitResultsPersisted(
+ executionId: ExecutionIdentity,
+ executionService: org.apache.texera.web.service.WorkflowExecutionService,
+ targetOperatorIds: List[String],
+ timeoutMillis: Long = 2000L,
+ pollIntervalMillis: Long = 25L
+ ): Unit = {
+ def expectedOutputCount(opId: String): Long =
+ expectedDefaultPortOutputCount(
+ executionService.executionStateStore.statsStore.getState,
+ opId
+ )
+
+ def committedCount(opId: String): Option[Long] =
+ committedDefaultPortCount(
+ op =>
+ WorkflowExecutionsResource
+ .getResultUriByLogicalPortId(executionId, OperatorIdentity(op),
PortIdentity()),
+ uri =>
+ DocumentFactory
+ .openDocument(uri)
+ ._1
+ .asInstanceOf[VirtualDocument[Tuple]]
+ .getCount
+ )(opId)
+
+ awaitUntil(
+ targetOperatorIds,
+ expectedOutputCount,
+ committedCount,
+ timeoutMillis,
+ pollIntervalMillis,
+ () => System.currentTimeMillis(),
+ Thread.sleep
+ )
+ }
+
+ // Default external output port (PortIdentity()) row count from stats; 0 if
absent.
+ private[resource] def expectedDefaultPortOutputCount(
+ stats: ExecutionStatsStore,
+ opId: String
+ ): Long =
+ stats.operatorInfo
+ .get(opId)
+ .flatMap { metrics =>
+ metrics.operatorStatistics.outputMetrics
+ .find(_.portId == PortIdentity())
+ .map(_.tupleMetrics.count)
+ }
+ .getOrElse(0L)
+
+ // Committed rows for the default result port; None when no storage, 0 when
countOf throws.
+ private[resource] def committedDefaultPortCount(
+ resultUriOf: String => Option[URI],
+ countOf: URI => Long
+ )(opId: String): Option[Long] =
+ resultUriOf(opId).map { uri =>
+ try {
+ countOf(uri)
+ } catch {
+ case _: Exception => 0L
+ }
+ }
+
+ /**
+ * Blocks until every target operator is ready or `timeoutMillis` elapses,
sleeping
+ * `pollIntervalMillis` between checks. An operator is ready when its
expected count is
+ * non-positive, it has no committed count, or its committed count reaches
the expected count.
+ * The clock and sleep are injected so tests can drive timing.
+ */
+ private[resource] def awaitUntil(
+ targetOperatorIds: List[String],
+ expectedCountOf: String => Long,
+ committedCountOf: String => Option[Long],
+ timeoutMillis: Long,
+ pollIntervalMillis: Long,
+ now: () => Long,
+ sleep: Long => Unit
+ ): Unit = {
+ if (targetOperatorIds.isEmpty) return
+
+ def ready: Boolean =
+ targetOperatorIds.forall { opId =>
+ val expected = expectedCountOf(opId)
+ expected <= 0 || committedCountOf(opId).forall(_ >= expected)
+ }
+
+ val deadline = now() + timeoutMillis
+ while (!ready && now() < deadline) {
+ sleep(pollIntervalMillis)
+ }
Review Comment:
Keeping as-is: the worst-case oversleep is one 25 ms poll interval against the
2 s cap, the happy path never sleeps, and the only extra cost is a single
doc-open on the final post-deadline pass. Not worth restructuring.
##########
amber/src/main/scala/org/apache/texera/web/resource/SyncExecutionResource.scala:
##########
@@ -375,6 +375,104 @@ class SyncExecutionResource extends LazyLogging {
}
}
+ /**
+ * Blocks until every target operator's default external result port holds
at least as many rows
+ * as its stats report, or until `timeoutMillis` elapses. Operators with no
result storage are
+ * treated as ready.
+ */
+ private def awaitResultsPersisted(
+ executionId: ExecutionIdentity,
+ executionService: org.apache.texera.web.service.WorkflowExecutionService,
+ targetOperatorIds: List[String],
+ timeoutMillis: Long = 2000L,
+ pollIntervalMillis: Long = 25L
+ ): Unit = {
+ def expectedOutputCount(opId: String): Long =
+ expectedDefaultPortOutputCount(
+ executionService.executionStateStore.statsStore.getState,
+ opId
+ )
+
+ def committedCount(opId: String): Option[Long] =
+ committedDefaultPortCount(
+ op =>
+ WorkflowExecutionsResource
+ .getResultUriByLogicalPortId(executionId, OperatorIdentity(op),
PortIdentity()),
+ uri =>
+ DocumentFactory
+ .openDocument(uri)
+ ._1
+ .asInstanceOf[VirtualDocument[Tuple]]
+ .getCount
+ )(opId)
+
+ awaitUntil(
+ targetOperatorIds,
+ expectedOutputCount,
+ committedCount,
+ timeoutMillis,
+ pollIntervalMillis,
+ () => System.currentTimeMillis(),
+ Thread.sleep
+ )
+ }
+
+ // Default external output port (PortIdentity()) row count from stats; 0 if
absent.
+ private[resource] def expectedDefaultPortOutputCount(
+ stats: ExecutionStatsStore,
+ opId: String
+ ): Long =
+ stats.operatorInfo
+ .get(opId)
+ .flatMap { metrics =>
+ metrics.operatorStatistics.outputMetrics
+ .find(_.portId == PortIdentity())
Review Comment:
They converge for the target ops: the output-port metric is bumped in the
same step that enqueues the tuple to that port's storage
(`DataProcessor.scala:185-187`), and targets are exactly the ops with
default-port storage provisioned. Ops with no storage return
`committed = None` and are treated as ready (`forall` on `None` is vacuously
true). A silent writer failure could hold `committed < expected`, but the wait
is bounded by the 2s cap (old fixed-wait behavior) and the failure is now
surfaced by the debug log. So no path blocks indefinitely on a condition that
can never be satisfied.
##########
amber/src/main/scala/org/apache/texera/web/resource/SyncExecutionResource.scala:
##########
@@ -375,6 +375,104 @@ class SyncExecutionResource extends LazyLogging {
}
}
+ /**
+ * Blocks until every target operator's default external result port holds
at least as many rows
+ * as its stats report, or until `timeoutMillis` elapses. Operators with no
result storage are
+ * treated as ready.
+ */
+ private def awaitResultsPersisted(
+ executionId: ExecutionIdentity,
+ executionService: org.apache.texera.web.service.WorkflowExecutionService,
+ targetOperatorIds: List[String],
+ timeoutMillis: Long = 2000L,
+ pollIntervalMillis: Long = 25L
+ ): Unit = {
+ def expectedOutputCount(opId: String): Long =
+ expectedDefaultPortOutputCount(
+ executionService.executionStateStore.statsStore.getState,
+ opId
+ )
+
+ def committedCount(opId: String): Option[Long] =
+ committedDefaultPortCount(
+ op =>
+ WorkflowExecutionsResource
+ .getResultUriByLogicalPortId(executionId, OperatorIdentity(op),
PortIdentity()),
+ uri =>
+ DocumentFactory
+ .openDocument(uri)
+ ._1
+ .asInstanceOf[VirtualDocument[Tuple]]
+ .getCount
+ )(opId)
+
+ awaitUntil(
+ targetOperatorIds,
+ expectedOutputCount,
+ committedCount,
+ timeoutMillis,
+ pollIntervalMillis,
+ () => System.currentTimeMillis(),
+ Thread.sleep
+ )
+ }
+
+ // Default external output port (PortIdentity()) row count from stats; 0 if
absent.
+ private[resource] def expectedDefaultPortOutputCount(
+ stats: ExecutionStatsStore,
+ opId: String
+ ): Long =
+ stats.operatorInfo
+ .get(opId)
+ .flatMap { metrics =>
+ metrics.operatorStatistics.outputMetrics
+ .find(_.portId == PortIdentity())
+ .map(_.tupleMetrics.count)
+ }
+ .getOrElse(0L)
+
+ // Committed rows for the default result port; None when no storage, 0 when
countOf throws.
+ private[resource] def committedDefaultPortCount(
+ resultUriOf: String => Option[URI],
+ countOf: URI => Long
+ )(opId: String): Option[Long] =
+ resultUriOf(opId).map { uri =>
+ try {
+ countOf(uri)
+ } catch {
+ case _: Exception => 0L
Review Comment:
Added a `logger.debug` with opId + URI in the catch; still returns `0L`, so
polling is unchanged.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]