This is an automated email from the ASF dual-hosted git repository.

laiyingchun pushed a commit to tag kudu-1.12.0-mdh1.0.0-4c2c075-centos-release in repository https://gitbox.apache.org/repos/asf/kudu.git
commit b42d25b4dc5c41d3a379f0a4324910612472660c Author: zhangyifan8 <[email protected]> AuthorDate: Fri Jun 5 18:12:01 2020 +0800 [script] fix minos_control_server.py for newer versions According to https://kudu.apache.org/docs/administration.html#rolling_restart, fix minos_control_server.py. And also add some new metrics introduced in 1.12.0. --- src/kudu/scripts/falcon_screen.json | 30 +++++++++++++- src/kudu/scripts/falcon_screen.py | 1 + src/kudu/scripts/minos_control_server.py | 69 ++++++++++++++++++++++++++++---- 3 files changed, 90 insertions(+), 10 deletions(-) diff --git a/src/kudu/scripts/falcon_screen.json b/src/kudu/scripts/falcon_screen.json index 26b7acd..8f467e5 100644 --- a/src/kudu/scripts/falcon_screen.json +++ b/src/kudu/scripts/falcon_screen.json @@ -44,6 +44,8 @@ "metric=key_file_lookups_per_op_percentile_99 service=kudu cluster=${cluster.name} level=${level} v=4", "metric=key_file_lookups service=kudu cluster=${cluster.name} level=${level} v=4", "metric=leader_memory_pressure_rejections service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=last_read_elapsed_time service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=last_write_elapsed_time service=kudu cluster=${cluster.name} level=${level} v=4", "metric=live_row_count service=kudu cluster=${cluster.name} level=${level} v=4", "metric=log_append_latency_percentile_99 service=kudu cluster=${cluster.name} level=${level} v=4", "metric=log_bytes_logged service=kudu cluster=${cluster.name} level=${level} v=4", @@ -84,6 +86,7 @@ "metric=state service=kudu cluster=${cluster.name} level=${level} v=4", "metric=tablet_active_scanners service=kudu cluster=${cluster.name} level=${level} v=4", "metric=time_since_last_leader_heartbeat service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=transaction_memory_limit_rejections service=kudu cluster=${cluster.name} level=${level} v=4", "metric=transaction_memory_pressure_rejections service=kudu cluster=${cluster.name} 
level=${level} v=4", "metric=undo_delta_block_estimated_retained_bytes service=kudu cluster=${cluster.name} level=${level} v=4", "metric=undo_delta_block_gc_bytes_deleted service=kudu cluster=${cluster.name} level=${level} v=4", @@ -173,7 +176,12 @@ "metric=kudu-table-health service=kudu cluster=${cluster.name} level=${level} v=4" ], "table_l": [ + "metric=last_read_elapsed_time service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=last_write_elapsed_time service=kudu cluster=${cluster.name} level=${level} v=4", "metric=leader_memory_pressure_rejections service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=live_row_count service=kudu cluster=${cluster.name} level=${level} v=4" + ], + "table_log": [ "metric=log_append_latency_mean service=kudu cluster=${cluster.name} level=${level} v=4", "metric=log_append_latency_percentile_75 service=kudu cluster=${cluster.name} level=${level} v=4", "metric=log_append_latency_percentile_95 service=kudu cluster=${cluster.name} level=${level} v=4", @@ -207,8 +215,7 @@ "metric=log_sync_latency_mean service=kudu cluster=${cluster.name} level=${level} v=4", "metric=log_sync_latency_percentile_75 service=kudu cluster=${cluster.name} level=${level} v=4", "metric=log_sync_latency_percentile_95 service=kudu cluster=${cluster.name} level=${level} v=4", - "metric=log_sync_latency_percentile_99 service=kudu cluster=${cluster.name} level=${level} v=4", - "metric=live_row_count service=kudu cluster=${cluster.name} level=${level} v=4" + "metric=log_sync_latency_percentile_99 service=kudu cluster=${cluster.name} level=${level} v=4" ], "table_mn": [ "metric=majority_done_ops service=kudu cluster=${cluster.name} level=${level} v=4", @@ -276,6 +283,7 @@ "table_tw": [ "metric=tablet_active_scanners service=kudu cluster=${cluster.name} level=${level} v=4", "metric=time_since_last_leader_heartbeat service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=transaction_memory_limit_rejections service=kudu 
cluster=${cluster.name} level=${level} v=4", "metric=transaction_memory_pressure_rejections service=kudu cluster=${cluster.name} level=${level} v=4", "metric=write_op_duration_client_propagated_consistency_mean service=kudu cluster=${cluster.name} level=${level} v=4", "metric=write_op_duration_client_propagated_consistency_percentile_75 service=kudu cluster=${cluster.name} level=${level} v=4", @@ -401,6 +409,7 @@ "metric=rpcs_queue_overflow service=kudu cluster=${cluster.name} level=host v=4", "metric=rpcs_timed_out_in_queue service=kudu cluster=${cluster.name} level=host v=4", "metric=spinlock_contention_time service=kudu cluster=${cluster.name} level=host v=4", + "metric=sys_catalog_oversized_write_requests service=kudu cluster=${cluster.name} level=host v=4", "metric=tablet_copy_bytes_sent service=kudu cluster=${cluster.name} level=host v=4", "metric=tablet_copy_open_source_sessions service=kudu cluster=${cluster.name} level=host v=4", "metric=tcmalloc_current_total_thread_cache_bytes service=kudu cluster=${cluster.name} level=host v=4", @@ -491,6 +500,7 @@ "metric=log_block_manager_dead_containers_deleted service=kudu cluster=${cluster.name} level=host v=4", "metric=log_block_manager_full_containers service=kudu cluster=${cluster.name} level=host v=4", "metric=log_block_manager_holes_punched service=kudu cluster=${cluster.name} level=host v=4", + "metric=num_raft_leaders service=kudu cluster=${cluster.name} level=host v=4", "metric=op_apply_queue_length_percentile_99 service=kudu cluster=${cluster.name} level=host v=4", "metric=op_apply_queue_time_percentile_99 service=kudu cluster=${cluster.name} level=host v=4", "metric=op_apply_run_time_percentile_99 service=kudu cluster=${cluster.name} level=host v=4", @@ -744,6 +754,22 @@ ] }, { + "screen": "${cluster.name} [metrics_log]", + "graphs": [ + { + "title": "单表metrics", + "endpoints": ["${for.each.table}"], + "counters": { + "level": "table", + "template": "table_log" + }, + "graph_type": "h", + "method": "", + 
"timespan": 86400 + } + ] + }, + { "screen": "${cluster.name} [metrics_mn]", "graphs": [ { diff --git a/src/kudu/scripts/falcon_screen.py b/src/kudu/scripts/falcon_screen.py index 1695ca5..000d05f 100755 --- a/src/kudu/scripts/falcon_screen.py +++ b/src/kudu/scripts/falcon_screen.py @@ -39,6 +39,7 @@ screenIdList = { "[metrics_f]", "[metrics_ghijk]", "[metrics_l]", + "[metrics_log]", "[metrics_mn]", "[metrics_o]", "[metrics_r]", diff --git a/src/kudu/scripts/minos_control_server.py b/src/kudu/scripts/minos_control_server.py index 875802c..e35bf33 100755 --- a/src/kudu/scripts/minos_control_server.py +++ b/src/kudu/scripts/minos_control_server.py @@ -53,7 +53,7 @@ def get_host(host_port): def is_cluster_health(): status, output = commands.getstatusoutput('${KUDU_HOME}/kudu cluster ksck @%s -consensus=false' - ' -ksck_format=json_compact -color=never' + ' -ksck_format=json_compact -quiescing_info=false -color=never' ' -sections=MASTER_SUMMARIES,TSERVER_SUMMARIES,TABLE_SUMMARIES' ' 2>/dev/null' % cluster) @@ -140,6 +140,22 @@ def get_tablet_server_info(hostname, tservers_info): return rpc_address, uuid +def get_cluster_version(): + version_info = dict() + version = '' + min_version = '9.9.9' + status, output = commands.getstatusoutput('${KUDU_HOME}/kudu cluster ksck @%s -sections=VERSION_SUMMARIES ' + '-quiescing_info=false -ksck_format=json_compact' + % cluster) + exit_if_failed(status, output) + version_info = json.loads(output) + for item in version_info['version_summaries']: + version = item['version'] + if version < min_version: + min_version = version + return min_version + + def set_flag(rpc_address, seconds): cmd = ('${KUDU_HOME}/kudu tserver set_flag %s follower_unavailable_considered_failed_sec %s' % (rpc_address, seconds)) @@ -147,13 +163,42 @@ def set_flag(rpc_address, seconds): exit_if_failed(status, output) +def maintain_tserver(op_type, ts_uuid): + cmd = ('${KUDU_HOME}/kudu tserver state %s @%s %s' + % (op_type, cluster, ts_uuid)) + status, output = 
commands.getstatusoutput(cmd) + exit_if_failed(status, output) + + +def wait_tserver_quiesce(rpc_address): + print(time_header() + 'Start to quiesce tserver ' + rpc_address) + cmd = ('${KUDU_HOME}/kudu tserver quiesce start %s -error_if_not_fully_quiesced' % (rpc_address)) + is_quiesced = False + while not is_quiesced: + status, output = commands.getstatusoutput(cmd) + if status == 0: + print(time_header() + 'Tablet server is fully quiesced.') + is_quiesced = True + else: + print(time_header() + output) + time.sleep(1) + + def rebalance_cluster(blacklist_tserver_uuid): ignored_tservers_uuid = set() for node in known_unhealth_nodes: rpc_address, uuid = get_tablet_server_info(node, tservers_info) ignored_tservers_uuid.add(uuid) - cmd = ('${KUDU_HOME}/kudu cluster rebalance @%s -blacklist_tservers=%s -ignored_tservers=%s' - % (cluster, blacklist_tserver_uuid, str(','.join(ignored_tservers_uuid)))) + if blacklist_tserver_uuid == '': + cmd = ('${KUDU_HOME}/kudu cluster rebalance @%s -ignored_tservers=%s -quiescing_info=false' + % (cluster, str(','.join(ignored_tservers_uuid)))) + elif version < '1.11': + cmd = ('${KUDU_HOME}/kudu cluster rebalance @%s -blacklist_tservers=%s -ignored_tservers=%s -quiescing_info=false' + % (cluster, blacklist_tserver_uuid, str(','.join(ignored_tservers_uuid)))) + else: + ignored_tservers_uuid.add(blacklist_tserver_uuid) + cmd = ('${KUDU_HOME}/kudu cluster rebalance @%s -ignored_tservers=%s -move_replicas_from_ignored_tservers' + % (cluster, str(','.join(ignored_tservers_uuid)))) p = subprocess.Popen(cmd, stdout = subprocess.PIPE, shell=True) for line in iter(p.stdout.readline, b''): print line @@ -184,8 +229,10 @@ check_parameter('You will rebalance cluster after operation: %s? 
(y/n)', rebalan tservers_info = get_tservers_info() wait_cluster_health() +version = get_cluster_version() +print('The cluster version(before rolling_update) is ' + version) -if 'tablet_server' in job and operate in ['restart', 'rolling_update']: +if version < '1.11' and 'tablet_server' in job and operate in ['restart', 'rolling_update']: for tserver in tservers_info: set_flag(tserver['rpc-addresses'], 7200) @@ -195,13 +242,16 @@ for task in tasks: exit() if 'tablet_server' in job: - cmd = ('%s/deploy show kudu %s --job %s --task %d' - % (minos_client_path, cluster, job, task)) + cmd = ('%s/deploy show kudu %s --job %s --task %d' % (minos_client_path, cluster, job, task)) status, output = commands.getstatusoutput(cmd) exit_if_failed(status, output) print(output) hostname = parse_node_from_minos_output(output, job) rpc_address, uuid = get_tablet_server_info(hostname, tservers_info) + if version > '1.11': + maintain_tserver("enter_maintenance", uuid) + if version > '1.12': + wait_tserver_quiesce(rpc_address) if operate == 'stop': # migrate replicas on tserver rebalance_cluster(uuid) @@ -218,12 +268,15 @@ for task in tasks: wait_cluster_health() if 'tablet_server' in job and operate in ['restart', 'rolling_update']: - set_flag(rpc_address, 7200) + if version < '1.11': + set_flag(rpc_address, 7200) + else: + maintain_tserver("exit_maintenance", uuid) print(time_header() + '==========================') time.sleep(10) -if 'tablet_server' in job and operate in ['restart', 'rolling_update']: +if version < '1.11' and 'tablet_server' in job and operate in ['restart', 'rolling_update']: for tserver in tservers_info: set_flag(tserver['rpc-addresses'], default_follower_unavailable_considered_failed_sec)
