[
https://issues.apache.org/jira/browse/KUDU-2638?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16724859#comment-16724859
]
jiaqiyang commented on KUDU-2638:
---------------------------------
In this code:
{code:java}
// code placeholder
// Picks the maintenance op that should run next, or returns nullptr if
// nothing should be scheduled right now.
//
// Every registered op has its stats cleared and refreshed; ops that are
// cancelled, have invalid stats, or are not runnable are ignored. The
// remaining candidates are then considered in this order, and the first
// rule that matches wins:
//   1. The LOW_IO_USAGE op retaining the most log bytes (if more than 0).
//   2. If memory_pressure_func_ reports pressure: the op anchoring the most
//      RAM, or nullptr if no candidate anchors any.
//   3. The op retaining the most log bytes (ties broken by RAM anchored),
//      if it retains at least FLAGS_log_target_replay_size_mb MiB.
//   4. The op retaining the most data bytes, if that exceeds
//      FLAGS_data_gc_min_size_mb MiB and either no op has a positive
//      perf improvement or a random draw falls within
//      FLAGS_data_gc_prioritization_prob.
//   5. The op with the highest positive perf improvement score.
MaintenanceOp* MaintenanceManager::FindBestOp() {
  TRACE_EVENT0("maintenance", "MaintenanceManager::FindBestOp");

  size_t free_threads = num_threads_ - running_ops_;
  if (free_threads == 0) {
    VLOG_AND_TRACE("maintenance", 1) << LogPrefix()
        << "There are no free threads, so we can't run anything.";
    return nullptr;
  }

  int64_t low_io_most_logs_retained_bytes = 0;
  MaintenanceOp* low_io_most_logs_retained_bytes_op = nullptr;

  uint64_t most_mem_anchored = 0;
  MaintenanceOp* most_mem_anchored_op = nullptr;

  int64_t most_logs_retained_bytes = 0;
  int64_t most_logs_retained_bytes_ram_anchored = 0;
  MaintenanceOp* most_logs_retained_bytes_op = nullptr;

  int64_t most_data_retained_bytes = 0;
  MaintenanceOp* most_data_retained_bytes_op = nullptr;

  double best_perf_improvement = 0;
  MaintenanceOp* best_perf_improvement_op = nullptr;

  // Single pass over all registered ops: refresh each op's stats and track
  // the best candidate for every selection criterion.
  for (OpMapTy::value_type &val : ops_) {
    MaintenanceOp* op(val.first);
    MaintenanceOpStats& stats(val.second);
    VLOG_WITH_PREFIX(3) << "Considering MM op " << op->name();

    // Update op stats.
    stats.Clear();
    op->UpdateStats(&stats);
    if (op->cancelled() || !stats.valid() || !stats.runnable()) {
      continue;
    }

    if (stats.logs_retained_bytes() > low_io_most_logs_retained_bytes &&
        op->io_usage() == MaintenanceOp::LOW_IO_USAGE) {
      low_io_most_logs_retained_bytes_op = op;
      low_io_most_logs_retained_bytes = stats.logs_retained_bytes();
      VLOG_AND_TRACE("maintenance", 2) << LogPrefix() << "Op " << op->name()
          << " can free " << stats.logs_retained_bytes() << " bytes of logs";
    }

    if (stats.ram_anchored() > most_mem_anchored) {
      most_mem_anchored_op = op;
      most_mem_anchored = stats.ram_anchored();
    }

    // We prioritize ops that can free more logs, but when it's the same we
    // pick the one that also frees up the most memory.
    if (stats.logs_retained_bytes() > 0 &&
        (stats.logs_retained_bytes() > most_logs_retained_bytes ||
         (stats.logs_retained_bytes() == most_logs_retained_bytes &&
          stats.ram_anchored() > most_logs_retained_bytes_ram_anchored))) {
      most_logs_retained_bytes_op = op;
      most_logs_retained_bytes = stats.logs_retained_bytes();
      most_logs_retained_bytes_ram_anchored = stats.ram_anchored();
    }

    if (stats.data_retained_bytes() > most_data_retained_bytes) {
      most_data_retained_bytes_op = op;
      most_data_retained_bytes = stats.data_retained_bytes();
      VLOG_AND_TRACE("maintenance", 2) << LogPrefix() << "Op " << op->name()
          << " can free " << stats.data_retained_bytes() << " bytes of data";
    }

    // The first candidate is always recorded, even with a non-positive
    // score, so the pointer alone does not imply a useful improvement;
    // the score is re-checked before it is returned below.
    if ((!best_perf_improvement_op) ||
        (stats.perf_improvement() > best_perf_improvement)) {
      best_perf_improvement_op = op;
      best_perf_improvement = stats.perf_improvement();
    }
  }

  // Look at ops that we can run quickly that free up log retention.
  if (low_io_most_logs_retained_bytes_op) {
    if (low_io_most_logs_retained_bytes > 0) {
      VLOG_AND_TRACE("maintenance", 1) << LogPrefix()
          << "Performing " << low_io_most_logs_retained_bytes_op->name() << ", "
          << "because it can free up more logs "
          << "at " << low_io_most_logs_retained_bytes
          << " bytes with a low IO cost";
      return low_io_most_logs_retained_bytes_op;
    }
  }

  // Look at free memory. If it is dangerously low, we must select something
  // that frees memory-- the op with the most anchored memory.
  double capacity_pct;
  if (memory_pressure_func_(&capacity_pct)) {
    if (!most_mem_anchored_op) {
      std::string msg = StringPrintf("we have exceeded our soft memory limit "
          "(current capacity is %.2f%%). However, there are no ops currently "
          "runnable which would free memory.", capacity_pct);
      LOG_WITH_PREFIX(INFO) << msg;
      return nullptr;
    }
    VLOG_AND_TRACE("maintenance", 1) << LogPrefix()
        << "We have exceeded our soft memory limit "
        << "(current capacity is " << capacity_pct << "%). Running the op "
        << "which anchors the most memory: " << most_mem_anchored_op->name();
    return most_mem_anchored_op;
  }

  // Retained log bytes are converted to MiB before comparing to the flag.
  if (most_logs_retained_bytes_op &&
      most_logs_retained_bytes / 1024 / 1024 >= FLAGS_log_target_replay_size_mb) {
    VLOG_AND_TRACE("maintenance", 1) << LogPrefix()
        << "Performing " << most_logs_retained_bytes_op->name() << ", "
        << "because it can free up more logs (" << most_logs_retained_bytes
        << " bytes)";
    return most_logs_retained_bytes_op;
  }

  // Look at ops that we can run quickly that free up data on disk.
  if (most_data_retained_bytes_op &&
      most_data_retained_bytes > FLAGS_data_gc_min_size_mb * 1024 * 1024) {
    // Data GC runs unconditionally when no op offers a positive perf
    // improvement; otherwise it runs only when the random draw falls
    // within the configured prioritization probability.
    if (!best_perf_improvement_op || best_perf_improvement <= 0 ||
        rand_.NextDoubleFraction() <= FLAGS_data_gc_prioritization_prob) {
      VLOG_AND_TRACE("maintenance", 1) << LogPrefix()
          << "Performing " << most_data_retained_bytes_op->name() << ", "
          << "because it can free up more data "
          << "at " << most_data_retained_bytes << " bytes";
      return most_data_retained_bytes_op;
    }
    VLOG(1) << "Skipping data GC due to prioritizing perf improvement";
  }

  if (best_perf_improvement_op && best_perf_improvement > 0) {
    VLOG_AND_TRACE("maintenance", 1) << LogPrefix() << "Performing "
        << best_perf_improvement_op->name() << ", "
        << "because it had the best perf_improvement score, "
        << "at " << best_perf_improvement;
    return best_perf_improvement_op;
  }

  return nullptr;
}
{code}
When the Kudu cluster is restarted, FindBestOp should not select most_data_retained_bytes_op.
> kudu cluster restart very long time to reused
> ---------------------------------------------
>
> Key: KUDU-2638
> URL: https://issues.apache.org/jira/browse/KUDU-2638
> Project: Kudu
> Issue Type: Improvement
> Reporter: jiaqiyang
> Priority: Major
> Fix For: n/a
>
> Attachments: kudu16.tc.tablet.png, tserverLog.tar.gz
>
>
> When I restart my Kudu cluster, all tablets are unavailable.
> Running "kudu cluster ksck" shows:
> Table Summary
>
>
> Name | Status | Total Tablets | Healthy | Under-replicated | Unavailable
> --------------------------------------------------------------------------------+------------
> t1 | HEALTHY | 1 | 1 | 0 | 0
> t2 | UNAVAILABLE | 5 | 0 | 1 | 4
> t3 | UNAVAILABLE | 6 | 2 | 0 | 4
> t3 | UNAVAILABLE | 3 | 0 | 0 | 3
--
This message was sent by Atlassian JIRA
(v7.6.3#76005)