snleee commented on code in PR #10083:
URL: https://github.com/apache/pinot/pull/10083#discussion_r1065359235
##########
pinot-common/src/main/java/org/apache/pinot/common/metrics/AbstractMetrics.java:
##########
@@ -333,9 +333,7 @@ public PinotMeter getMeteredTableValue(final String
tableName, final M meter) {
* @param unitCount The number of units to add to the gauge
*/
public void addValueToTableGauge(final String tableName, final G gauge,
final long unitCount) {
- final String fullGaugeName;
Review Comment:
+1 on this change
##########
pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/minion/TaskMetricsEmitter.java:
##########
@@ -71,42 +78,105 @@ protected final void runTask(Properties
periodicTaskProperties) {
_helixTaskResourceManager.getTaskMetadataLastUpdateTimeMs();
taskMetadataLastUpdateTime.forEach((tableNameWithType,
taskTypeLastUpdateTime) ->
taskTypeLastUpdateTime.forEach((taskType, lastUpdateTimeMs) ->
- _controllerMetrics.addOrUpdateGauge(
-
ControllerGauge.TIME_MS_SINCE_LAST_MINION_TASK_METADATA_UPDATE.getGaugeName() +
"."
- + tableNameWithType + "." + taskType, () ->
System.currentTimeMillis() - lastUpdateTimeMs)));
+ _controllerMetrics.setOrUpdateTableGauge(tableNameWithType,
taskType,
+ ControllerGauge.TIME_MS_SINCE_LAST_MINION_TASK_METADATA_UPDATE,
+ () -> System.currentTimeMillis() - lastUpdateTimeMs)));
// The call to get task types can take time if there are a lot of tasks.
// Potential optimization is to call it every (say) 30m if we detect a
barrage of
// zk requests.
Set<String> taskTypes = _helixTaskResourceManager.getTaskTypes();
for (String taskType : taskTypes) {
- TaskCount accumulated = new TaskCount();
+ TaskCount taskTypeAccumulatedCount = new TaskCount();
+ Map<String, TaskCount> tableAccumulatedCount = new HashMap<>();
try {
Set<String> tasksInProgress =
_helixTaskResourceManager.getTasksInProgress(taskType);
final int numRunningTasks = tasksInProgress.size();
for (String task : tasksInProgress) {
- TaskCount taskCount = _helixTaskResourceManager.getTaskCount(task);
- accumulated.accumulate(taskCount);
+ Map<String, TaskCount> tableTaskCount =
_helixTaskResourceManager.getTableTaskCount(task);
+ tableTaskCount.forEach((tableNameWithType, taskCount) -> {
+ taskTypeAccumulatedCount.accumulate(taskCount);
+ tableAccumulatedCount.compute(tableNameWithType, (name, count) -> {
+ if (count == null) {
+ count = new TaskCount();
+ }
+ count.accumulate(taskCount);
+ return count;
+ });
+ });
}
// Emit metrics for taskType.
_controllerMetrics.setValueOfGlobalGauge(ControllerGauge.NUM_MINION_TASKS_IN_PROGRESS,
taskType,
numRunningTasks);
_controllerMetrics.setValueOfGlobalGauge(ControllerGauge.NUM_MINION_SUBTASKS_RUNNING,
taskType,
- accumulated.getRunning());
+ taskTypeAccumulatedCount.getRunning());
_controllerMetrics.setValueOfGlobalGauge(ControllerGauge.NUM_MINION_SUBTASKS_WAITING,
taskType,
- accumulated.getWaiting());
+ taskTypeAccumulatedCount.getWaiting());
_controllerMetrics.setValueOfGlobalGauge(ControllerGauge.NUM_MINION_SUBTASKS_ERROR,
taskType,
- accumulated.getError());
- int total = accumulated.getTotal();
- int percent = total != 0 ? (accumulated.getWaiting() +
accumulated.getRunning()) * 100 / total : 0;
+ taskTypeAccumulatedCount.getError());
+ int total = taskTypeAccumulatedCount.getTotal();
+ int percent = total != 0
+ ? (taskTypeAccumulatedCount.getWaiting() +
taskTypeAccumulatedCount.getRunning()) * 100 / total : 0;
_controllerMetrics.setValueOfGlobalGauge(ControllerGauge.PERCENT_MINION_SUBTASKS_IN_QUEUE,
taskType, percent);
- percent = total != 0 ? accumulated.getError() * 100 / total : 0;
+ percent = total != 0 ? taskTypeAccumulatedCount.getError() * 100 /
total : 0;
_controllerMetrics.setValueOfGlobalGauge(ControllerGauge.PERCENT_MINION_SUBTASKS_IN_ERROR,
taskType, percent);
+
+ // Emit metrics for table taskType
+ tableAccumulatedCount.forEach((tableNameWithType, taskCount) -> {
+ _controllerMetrics.setOrUpdateTableGauge(tableNameWithType, taskType,
+ ControllerGauge.NUM_MINION_SUBTASKS_RUNNING, () -> (long)
taskCount.getRunning());
+ _controllerMetrics.setOrUpdateTableGauge(tableNameWithType, taskType,
+ ControllerGauge.NUM_MINION_SUBTASKS_WAITING,
taskCount.getWaiting());
+ _controllerMetrics.setOrUpdateTableGauge(tableNameWithType, taskType,
+ ControllerGauge.NUM_MINION_SUBTASKS_ERROR, taskCount.getError());
+ int tableTotal = taskCount.getTotal();
+ int tablePercent = tableTotal != 0 ? (taskCount.getWaiting() +
taskCount.getRunning()) * 100 / tableTotal : 0;
+ _controllerMetrics.setOrUpdateTableGauge(tableNameWithType, taskType,
+ ControllerGauge.PERCENT_MINION_SUBTASKS_IN_QUEUE, tablePercent);
+ tablePercent = tableTotal != 0 ? taskCount.getError() * 100 /
tableTotal : 0;
+ _controllerMetrics.setOrUpdateTableGauge(tableNameWithType, taskType,
+ ControllerGauge.PERCENT_MINION_SUBTASKS_IN_ERROR, tablePercent);
+ });
+
+
+ if (_preReportedTables.containsKey(taskType)) {
+ Set<String> tableNameWithTypeSet = _preReportedTables.get(taskType);
+ tableNameWithTypeSet.removeAll(tableAccumulatedCount.keySet());
+ removeTableTaskTypeMetrics(tableNameWithTypeSet, taskType);
+ }
+ if (!tableAccumulatedCount.isEmpty()) {
+ // need to make a copy of the set because we may want to chagne the
set later
+ Set<String> tableNameWithTypeSet = new
HashSet<>(tableAccumulatedCount.keySet());
+ _preReportedTables.put(taskType, tableNameWithTypeSet);
+ } else {
+ _preReportedTables.remove(taskType);
+ }
} catch (Exception e) {
LOGGER.error("Caught exception while getting metrics for task type
{}", taskType, e);
}
}
+ // clean up metrics for task types that have already been removed
+ _preReportedTaskTypes.removeAll(taskTypes);
+ for (String taskType : _preReportedTaskTypes) {
Review Comment:
@zhtaoxiang Were we not cleaning up these metrics when a table is deleted?
##########
pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/minion/TaskMetricsEmitter.java:
##########
@@ -71,42 +78,105 @@ protected final void runTask(Properties
periodicTaskProperties) {
_helixTaskResourceManager.getTaskMetadataLastUpdateTimeMs();
taskMetadataLastUpdateTime.forEach((tableNameWithType,
taskTypeLastUpdateTime) ->
taskTypeLastUpdateTime.forEach((taskType, lastUpdateTimeMs) ->
- _controllerMetrics.addOrUpdateGauge(
-
ControllerGauge.TIME_MS_SINCE_LAST_MINION_TASK_METADATA_UPDATE.getGaugeName() +
"."
- + tableNameWithType + "." + taskType, () ->
System.currentTimeMillis() - lastUpdateTimeMs)));
+ _controllerMetrics.setOrUpdateTableGauge(tableNameWithType,
taskType,
+ ControllerGauge.TIME_MS_SINCE_LAST_MINION_TASK_METADATA_UPDATE,
+ () -> System.currentTimeMillis() - lastUpdateTimeMs)));
// The call to get task types can take time if there are a lot of tasks.
// Potential optimization is to call it every (say) 30m if we detect a
barrage of
// zk requests.
Set<String> taskTypes = _helixTaskResourceManager.getTaskTypes();
for (String taskType : taskTypes) {
- TaskCount accumulated = new TaskCount();
+ TaskCount taskTypeAccumulatedCount = new TaskCount();
+ Map<String, TaskCount> tableAccumulatedCount = new HashMap<>();
try {
Set<String> tasksInProgress =
_helixTaskResourceManager.getTasksInProgress(taskType);
final int numRunningTasks = tasksInProgress.size();
for (String task : tasksInProgress) {
- TaskCount taskCount = _helixTaskResourceManager.getTaskCount(task);
- accumulated.accumulate(taskCount);
+ Map<String, TaskCount> tableTaskCount =
_helixTaskResourceManager.getTableTaskCount(task);
+ tableTaskCount.forEach((tableNameWithType, taskCount) -> {
+ taskTypeAccumulatedCount.accumulate(taskCount);
+ tableAccumulatedCount.compute(tableNameWithType, (name, count) -> {
+ if (count == null) {
+ count = new TaskCount();
+ }
+ count.accumulate(taskCount);
+ return count;
+ });
+ });
}
// Emit metrics for taskType.
_controllerMetrics.setValueOfGlobalGauge(ControllerGauge.NUM_MINION_TASKS_IN_PROGRESS,
taskType,
numRunningTasks);
_controllerMetrics.setValueOfGlobalGauge(ControllerGauge.NUM_MINION_SUBTASKS_RUNNING,
taskType,
- accumulated.getRunning());
+ taskTypeAccumulatedCount.getRunning());
_controllerMetrics.setValueOfGlobalGauge(ControllerGauge.NUM_MINION_SUBTASKS_WAITING,
taskType,
- accumulated.getWaiting());
+ taskTypeAccumulatedCount.getWaiting());
_controllerMetrics.setValueOfGlobalGauge(ControllerGauge.NUM_MINION_SUBTASKS_ERROR,
taskType,
- accumulated.getError());
- int total = accumulated.getTotal();
- int percent = total != 0 ? (accumulated.getWaiting() +
accumulated.getRunning()) * 100 / total : 0;
+ taskTypeAccumulatedCount.getError());
+ int total = taskTypeAccumulatedCount.getTotal();
+ int percent = total != 0
+ ? (taskTypeAccumulatedCount.getWaiting() +
taskTypeAccumulatedCount.getRunning()) * 100 / total : 0;
_controllerMetrics.setValueOfGlobalGauge(ControllerGauge.PERCENT_MINION_SUBTASKS_IN_QUEUE,
taskType, percent);
- percent = total != 0 ? accumulated.getError() * 100 / total : 0;
+ percent = total != 0 ? taskTypeAccumulatedCount.getError() * 100 /
total : 0;
_controllerMetrics.setValueOfGlobalGauge(ControllerGauge.PERCENT_MINION_SUBTASKS_IN_ERROR,
taskType, percent);
+
+ // Emit metrics for table taskType
+ tableAccumulatedCount.forEach((tableNameWithType, taskCount) -> {
+ _controllerMetrics.setOrUpdateTableGauge(tableNameWithType, taskType,
+ ControllerGauge.NUM_MINION_SUBTASKS_RUNNING, () -> (long)
taskCount.getRunning());
+ _controllerMetrics.setOrUpdateTableGauge(tableNameWithType, taskType,
+ ControllerGauge.NUM_MINION_SUBTASKS_WAITING,
taskCount.getWaiting());
+ _controllerMetrics.setOrUpdateTableGauge(tableNameWithType, taskType,
+ ControllerGauge.NUM_MINION_SUBTASKS_ERROR, taskCount.getError());
+ int tableTotal = taskCount.getTotal();
+ int tablePercent = tableTotal != 0 ? (taskCount.getWaiting() +
taskCount.getRunning()) * 100 / tableTotal : 0;
+ _controllerMetrics.setOrUpdateTableGauge(tableNameWithType, taskType,
+ ControllerGauge.PERCENT_MINION_SUBTASKS_IN_QUEUE, tablePercent);
+ tablePercent = tableTotal != 0 ? taskCount.getError() * 100 /
tableTotal : 0;
+ _controllerMetrics.setOrUpdateTableGauge(tableNameWithType, taskType,
+ ControllerGauge.PERCENT_MINION_SUBTASKS_IN_ERROR, tablePercent);
+ });
+
Review Comment:
Remove this extra blank line.
##########
pinot-common/src/main/java/org/apache/pinot/common/metrics/AbstractMetrics.java:
##########
@@ -529,11 +517,9 @@ public void initializeGlobalMeters() {
}
}
+ @Deprecated
Review Comment:
Let's add a comment indicating which function to use in place of each
deprecated one. (This applies to all `@Deprecated` annotations.)
##########
pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/minion/PinotHelixTaskResourceManager.java:
##########
@@ -364,7 +366,46 @@ public synchronized TaskCount getTaskCount(String
parentTaskName) {
}
/**
- * Returns a set of Task names (in the form "Task_TestTask_1624403781879")
that are in progress or not started yet.
+ * This method returns a map of table name to count of sub-tasks in various
states, given the top-level task name.
+ * @param parentTaskName in the form "Task_<taskType>_<uuid>_<timestamp>"
+ * @return a map of table name to {@link TaskCount}
+ */
+ public synchronized Map<String, TaskCount> getTableTaskCount(String
parentTaskName) {
+ Map<String, TaskPartitionState> subtaskStates =
getSubtaskStates(parentTaskName);
+ if (subtaskStates.isEmpty()) {
+ return Collections.emptyMap();
+ }
+
+ JobConfig jobConfig =
_taskDriver.getJobConfig(getHelixJobName(parentTaskName));
+ // in theory, this should not happen because we have already checked
JobContext
+ if (jobConfig == null) {
+ return Collections.emptyMap();
+ }
+
+ Map<String, TaskCount> tableTaskCountMap = new HashMap<>();
+ subtaskStates.forEach((taskId, taskState) -> {
+ TaskConfig taskConfig = jobConfig.getTaskConfig(taskId);
+ String tableNameWithType;
+ // in theory, this should not happen because jobContext has this taskId
+ if (taskConfig == null) {
+ tableNameWithType = UNKNOWN_TABLE_NAME;
Review Comment:
Should we at least log this case, since it is something that should not
happen?
##########
pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/minion/PinotHelixTaskResourceManager.java:
##########
@@ -364,7 +366,46 @@ public synchronized TaskCount getTaskCount(String
parentTaskName) {
}
/**
- * Returns a set of Task names (in the form "Task_TestTask_1624403781879")
that are in progress or not started yet.
+ * This method returns a map of table name to count of sub-tasks in various
states, given the top-level task name.
+ * @param parentTaskName in the form "Task_<taskType>_<uuid>_<timestamp>"
Review Comment:
I assume the `uuid` is already part of the task name, and this change is just
a documentation improvement?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]