This is an automated email from the ASF dual-hosted git repository. laiyingchun pushed a commit to tag kudu-1.12.0-mdh1.0.0-4c2c075-centos-release in repository https://gitbox.apache.org/repos/asf/kudu.git
commit e09b284b8fc41701ef017e3aacdfab5543332fe1 Author: laiyingchun <[email protected]> AuthorDate: Wed Jan 9 19:27:16 2019 +0800 [script] Add script tools --- kudu | 1 + src/kudu/scripts/batch_operate_on_tables.sh | 68 ++++ src/kudu/scripts/build_env.sh | 24 ++ src/kudu/scripts/build_kudu.sh | 162 ++++++++ src/kudu/scripts/cal_bill_daily.py | 280 +++++++++++++ src/kudu/scripts/falcon_screen.json | 603 ++++++++++++++++++++++++++++ src/kudu/scripts/falcon_screen.py | 603 ++++++++++++++++++++++++++++ src/kudu/scripts/kudu_falcon_screen.sh | 119 ++++++ src/kudu/scripts/kudu_utils.py | 106 +++++ src/kudu/scripts/kudurc | 69 ++++ src/kudu/scripts/minos_control_server.py | 225 +++++++++++ 11 files changed, 2260 insertions(+) diff --git a/kudu b/kudu new file mode 120000 index 0000000..f2638cd --- /dev/null +++ b/kudu @@ -0,0 +1 @@ +build/release/bin/kudu \ No newline at end of file diff --git a/src/kudu/scripts/batch_operate_on_tables.sh b/src/kudu/scripts/batch_operate_on_tables.sh new file mode 100755 index 0000000..09a0e3e --- /dev/null +++ b/src/kudu/scripts/batch_operate_on_tables.sh @@ -0,0 +1,68 @@ +#!/bin/bash + +if [ $# -lt 3 ] +then + echo "This tool is for batch operation on batch of tables in a cluster" + echo "USAGE: $0 file operate cluster [dst-cluster]" + echo " file: A file contains several table names in a cluster, one table name per line." 
+ echo " Or 'auto' means all tables in this cluster" + echo " operate: Now support 'copy', 'delete', 'describe' and 'scan'" + echo " cluster: CLuster name or master RPC addresses" + echo " dst-cluster: Master addresses of destination cluster, needed only when 'operate' is 'copy'" + exit -1 +fi + +FILE=$1 +OPERATE=$2 +CLUSTER=$3 +DST_CLUSTER=$4 +#FLAGS="-show_attributes" +#FLAGS="-create_table=false -write_type=upsert" +BIN_PATH=${KUDU_HOME}/kudu +PID=$$ + +echo "UID: ${UID}" +echo "PID: ${PID}" +echo "tables:" +if [ "${FILE}" == "auto" ] +then + echo "All tables in the cluster" +else + cat ${FILE} +fi +echo "operate: ${OPERATE}" +echo "cluster: ${CLUSTER}" +echo "dst cluster: ${DST_CLUSTER}" +echo "flags: ${FLAGS}" + +echo "" +echo "All params above have been checked? (yes)" +read INPUT +if [ ! -n "${INPUT}" ] || [ "${INPUT}" != "yes" ] +then + exit $? +fi + +if [ -n "${DST_CLUSTER}" ] +then + DST_CLUSTER=@${DST_CLUSTER} +fi + +if [ "${FILE}" == "auto" ] +then + TABLE_LIST=/tmp/$UID.${PID}.table.list + ${BIN_PATH} table list @${CLUSTER} | sort -n >${TABLE_LIST} +else + TABLE_LIST=${FILE} +fi + +if [ ! -f "${TABLE_LIST}" ] +then + echo "file ${TABLE_LIST} is not exist!" + exit $? 
+fi + +while read TABLE +do + ${BIN_PATH} table ${OPERATE} @${CLUSTER} ${TABLE} ${DST_CLUSTER} ${FLAGS} +done < ${TABLE_LIST} diff --git a/src/kudu/scripts/build_env.sh b/src/kudu/scripts/build_env.sh new file mode 100755 index 0000000..dde2d19 --- /dev/null +++ b/src/kudu/scripts/build_env.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +OS=`lsb_release -d | awk '{print $2}'` +echo "Setup build env for kudu on $OS" + +if [[ "$OS" == "CentOS" ]]; then + sudo yum -y install autoconf automake cyrus-sasl-devel cyrus-sasl-gssapi \ + cyrus-sasl-plain flex gcc gcc-c++ gdb git java-1.8.0-openjdk-devel \ + krb5-server krb5-workstation libtool make openssl-devel patch pkgconfig \ + redhat-lsb-core rsync unzip vim-common which + DTLS_RPM=rhscl-devtoolset-3-epel-6-x86_64-1-2.noarch.rpm + DTLS_RPM_URL=https://www.softwarecollections.org/repos/rhscl/devtoolset-3/epel-6-x86_64/noarch/${DTLS_RPM} + wget ${DTLS_RPM_URL} -O ${DTLS_RPM} + sudo yum install -y scl-utils ${DTLS_RPM} + sudo yum install -y devtoolset-3-toolchain +elif [[ "$OS" == "Ubuntu" ]]; then + sudo apt-get -y install autoconf automake curl flex g++ gcc gdb git \ + krb5-admin-server krb5-kdc krb5-user libkrb5-dev libsasl2-dev libsasl2-modules \ + libsasl2-modules-gssapi-mit libssl-dev libtool lsb-release make ntp \ + openjdk-8-jdk openssl patch pkg-config python rsync unzip vim-common +else + echo "Unsupported OS: $OS in $0" + exit +fi diff --git a/src/kudu/scripts/build_kudu.sh b/src/kudu/scripts/build_kudu.sh new file mode 100755 index 0000000..1016697 --- /dev/null +++ b/src/kudu/scripts/build_kudu.sh @@ -0,0 +1,162 @@ +#!/bin/bash + +BASE_DIR="$( cd "$( dirname "$0" )" && cd ../../.. && pwd )" + +function usage() +{ + echo "Options:" + echo " -h" + echo " -g|--custom-gcc" + exit 0 +} + +#USAGE: copy_file src [src...] dest +function copy_file() { + if [[ $# -lt 2 ]]; then + echo "ERROR: invalid copy file command: cp $*" + exit 1 + fi + cp -v $* + if [[ $? 
-ne 0 ]]; then + echo "ERROR: copy file failed: cp $*" + exit 1 + fi +} + +function get_stdcpp_lib() +{ + libname=`ldd ${BASE_DIR}/build/latest/bin/kudu 2>/dev/null | grep libstdc++` + libname=`echo $libname | cut -f1 -d" "` + if [ $1 = "true" ]; then + gcc_path=`which gcc` + echo `dirname $gcc_path`/../lib64/$libname + else + libs=(`ldconfig -p|grep $libname|awk '{print $NF}'`) + + for lib in ${libs[*]}; do + if [ "`check_bit $lib`" = "true" ]; then + echo "$lib" + return + fi + done; + fi +} + +function check_bit() +{ + bit_mode=`getconf LONG_BIT` + lib=$1 + check_bit="" + is_softlink=`file $lib | grep "symbolic link"` + + if [ -z "$is_softlink" ]; then + check_bit=`file $lib |grep "$bit_mode-bit"` + else + real_lib_name=`ls -l $lib |awk '{print $NF}'` + lib_path=${lib%/*} + real_lib=${lib_path}"/"${real_lib_name} + check_bit=`file $real_lib |grep "$bit_mode-bit"` + fi + if [ -n "$check_bit" ]; then + echo "true" + fi +} + +custom_gcc="false" +while [[ $# > 0 ]]; do + option_key="$1" + case $option_key in + -g|--custom-gcc) + custom_gcc="true" + ;; + -h|--help) + usage + ;; + esac + shift +done + +KUDU_VERSION=`cat ${BASE_DIR}/version.txt` +OS=`lsb_release -d | awk '{print $2}'` +echo "Start to build kudu $KUDU_VERSION on $OS" + +if [[ "$OS" == "CentOS" ]]; then + ${BASE_DIR}/build-support/enable_devtoolset.sh + ${BASE_DIR}/thirdparty/build-if-necessary.sh +elif [[ "$OS" == "Ubuntu" ]]; then + ${BASE_DIR}/thirdparty/build-if-necessary.sh +else + echo "ERROR: unsupported OS: $OS in $0" + exit 1 +fi + +rm -rf ${BASE_DIR}/build/release +mkdir -p ${BASE_DIR}/build/release +cd ${BASE_DIR}/build/release +../../thirdparty/installed/common/bin/cmake -DCMAKE_BUILD_TYPE=release ../.. +make -j `cat /proc/cpuinfo | egrep "^processor\s:" | wc -l` +if [[ $? -ne 0 ]]; then + echo "ERROR: build Kudu failed" + exit 1 +fi +echo "Build Kudu succeed" + +VERSION_DEFINES=${BASE_DIR}/build/release/src/kudu/generated/version_defines.h +if [[ ! 
-f ${VERSION_DEFINES} ]]; then + echo "ERROR: $VERSION_DEFINES not found" + exit 1 +fi + +CLEAN_REPO=`grep "^#define KUDU_BUILD_CLEAN_REPO " ${VERSION_DEFINES} | awk '{print $NF}' | tr 'A-Z' 'a-z'` +if [[ "$CLEAN_REPO"x != "true"x ]]; then + echo "ERROR: repository is not clean" + exit 1 +fi + +VERSION=`grep "^#define KUDU_VERSION_STRING " ${VERSION_DEFINES} | cut -d "\"" -f 2` +COMMIT_ID=`grep "^#define KUDU_GIT_HASH " ${VERSION_DEFINES} | cut -d "\"" -f 2` +BUILD_TYPE=`grep "^#define KUDU_BUILD_TYPE " ${VERSION_DEFINES} | cut -d "\"" -f 2` +PACK_VERSION=`echo ${VERSION}-${COMMIT_ID:0:7}-${OS}-${BUILD_TYPE} | tr 'A-Z' 'a-z'` +PACK_NAME=kudu-${PACK_VERSION} + +echo "Starting package $PACK_NAME" +PACK_DIR=${BASE_DIR}/build/${PACK_NAME} +PACKAGE=${PACK_NAME}.tar.gz +rm -rf ${PACK_DIR} ${BASE_DIR}/build/${PACKAGE} +mkdir -p ${PACK_DIR} +echo "Coping files to $PACK_DIR" +copy_file ${BASE_DIR}/build/latest/bin/kudu-collector ${PACK_DIR}/kudu_collector +copy_file ${BASE_DIR}/build/latest/bin/kudu-master ${PACK_DIR}/kudu_master +copy_file ${BASE_DIR}/build/latest/bin/kudu-tserver ${PACK_DIR}/kudu_tablet_server +copy_file ${BASE_DIR}/build/latest/bin/kudu ${PACK_DIR}/ +copy_file `get_stdcpp_lib $custom_gcc` ${PACK_DIR}/ +copy_file ${BASE_DIR}/src/kudu/scripts/batch_operate_on_tables.sh ${PACK_DIR}/ +copy_file ${BASE_DIR}/src/kudu/scripts/falcon_screen.json ${PACK_DIR}/ +copy_file ${BASE_DIR}/src/kudu/scripts/falcon_screen.py ${PACK_DIR}/ +copy_file ${BASE_DIR}/src/kudu/scripts/kudu_falcon_screen.sh ${PACK_DIR}/ +copy_file ${BASE_DIR}/src/kudu/scripts/minos_control_server.py ${PACK_DIR}/ +copy_file ${BASE_DIR}/src/kudu/scripts/cal_bill_daily.py ${PACK_DIR}/ +copy_file ${BASE_DIR}/src/kudu/scripts/kudu_utils.py ${PACK_DIR}/ +copy_file ${BASE_DIR}/src/kudu/scripts/start_local_kudu.sh ${PACK_DIR}/ +copy_file ${BASE_DIR}/src/kudu/scripts/kudurc ${PACK_DIR}/ +copy_file -r ${BASE_DIR}/www ${PACK_DIR}/ +cd ${BASE_DIR}/build +tar -czf ${PACKAGE} ${PACK_NAME} +echo "Packaged 
$PACKAGE succeed" + +PACK_TEMPLATE="" +if [[ -n "$MINOS_CONFIG_FILE" ]]; then + PACK_TEMPLATE=`dirname $MINOS_CONFIG_FILE`/xiaomi-config/package/kudu.yaml +fi + +if [[ -f ${PACK_TEMPLATE} ]]; then + echo "Modifying $PACK_TEMPLATE ..." + sed -i "/^version:/c version: \"$PACK_VERSION\"" ${PACK_TEMPLATE} + sed -i "/^build:/c build: \"\.\/run.sh pack\"" ${PACK_TEMPLATE} + sed -i "/^source:/c source: \"$BASE_DIR/build\"" ${PACK_TEMPLATE} +else + echo "ERROR: modify kudu.yaml failed" + exit 1 +fi + +echo "Done" diff --git a/src/kudu/scripts/cal_bill_daily.py b/src/kudu/scripts/cal_bill_daily.py new file mode 100755 index 0000000..a92b72b --- /dev/null +++ b/src/kudu/scripts/cal_bill_daily.py @@ -0,0 +1,280 @@ +#! /usr/bin/env python +# coding=utf-8 + +import commands +import datetime +from git import Repo +import heapq +import logging +from logging.handlers import RotatingFileHandler +import json +import os +import re +import sys +import time +import kudu_utils +import yaml + + +g_ignore_db_set = ('system', 'lcsbinlog', 'default', 'zhangxu_test_kudu') +g_month_path, g_month_data_path = kudu_utils.prepare_pricing_month_path() +g_clusters_info_dict = yaml.load(open(kudu_utils.g_script_path + '/kudurc', 'r').read(), Loader=yaml.FullLoader) +g_clusters_info = g_clusters_info_dict['clusters_info'] +g_commit_filenames = list() +g_git_repo_dir = '' + + +def printtsr(level, table, size, reason): + kudu_utils.LOG.log(level, 'table: ' + table + (', size: %fG' % (size/(1 << 30)) + ', reason: ') + reason) + + +class TopKHeap(object): + def __init__(self, k): + self.k = k + self.data = [] + + def push(self, elem): + if len(self.data) < self.k: + heapq.heappush(self.data, elem) + else: + top_k_small = self.data[0] + if elem['size'] > top_k_small['size']: + heapq.heapreplace(self.data, elem) + + def top_k(self): + return {x['table']: x['size'] for x in reversed([heapq.heappop(self.data) for _ in xrange(len(self.data))])} + + +def add_org_size(dbtable, org, size, org_size_desc): + if 
len(org) == 0: + printtsr(logging.WARNING, dbtable, size, 'Org name is empty') + return False + + if org not in org_size_desc.keys(): + org_size_desc[org] = {} + org_size_desc[org]['size'] = 0 + org_size_desc[org]['desc'] = TopKHeap(10) + org_size_desc[org]['size'] += size + org_size_desc[org]['desc'].push({'size': size, 'table': dbtable}) + return True + + +def get_org_size_desc_from_olap(cluster_name, dbtable_size_dict, known_db_org_dict): + db_org_dict = {} + meta_table = 'system.kudu_table_owners' + cmd = '%s/kudu table scan @%s %s -show_values=true' \ + ' -columns=name,db,org 2>&1 | grep "(string name=\\\""'\ + % (kudu_utils.script_path(), cluster_name, meta_table) + status, output = commands.getstatusoutput(cmd) + if status != 0: + kudu_utils.LOG.error('Scan table %s error, command %s, status %d, output \n%s' % (meta_table, cmd, status, output)) + else: + for line in output.splitlines(): + match_obj = re.search(r'string name="(.*)", string db="(.*)", string org="(.*)"', line, re.M | re.I) + if match_obj: + db = match_obj.group(2) + org = match_obj.group(3) + db_org_dict[db] = org + else: + kudu_utils.LOG.error('Table %s value format error, line\n%s' % (meta_table, line)) + + total_ignored_size = 0.0 + org_size_desc = {} + for dbtable, size in dbtable_size_dict.iteritems(): + db_table_list = dbtable.split('.') + if len(db_table_list) != 2: + total_ignored_size += size + printtsr(logging.WARNING, dbtable, size, 'Lack db') + continue + + db, table = db_table_list[0], db_table_list[1] + if db in g_ignore_db_set: + total_ignored_size += size + printtsr(logging.INFO, dbtable, size, 'Ignored table') + continue + + if db in known_db_org_dict.keys(): + # 'org' from config file + org = known_db_org_dict[db] + elif db in db_org_dict.keys(): + # 'org' from system table + org = db_org_dict[db] + else: + total_ignored_size += size + printtsr(logging.WARNING, db, size, 'Lack org ID') + continue + + if not add_org_size(dbtable, org, size, org_size_desc): + total_ignored_size 
+= size + continue + + printtsr(logging.WARNING, 'TOTAL', total_ignored_size, 'Total ignored size') + return org_size_desc + + +def get_cluster_stat_filename(date, cluster_name): + return g_month_data_path + date + '_' + cluster_name + + +def get_service_usage_filename(date): + return g_month_data_path + date + '_kudu_total' + + +def collect_origin_usage_for_cluster(cluster_name, cluster_info): + kudu_utils.LOG.info('Start to collect usage info for cluster %s' % cluster_name) + # Output: db.table size + cmd = '%s/kudu_collector -collector_master_addrs=%s ' \ + '-collector_report_method=local -collector_metrics=on_disk_size -log_dir=./log | ' \ + 'egrep "^table on_disk_size " | sort | awk \'{print $3, $4}\'' \ + % (kudu_utils.g_script_path, cluster_info['master_addresses']) + status, output = commands.getstatusoutput(cmd) + if status != 0: + kudu_utils.LOG.fatal('Table stat error') + return + + dbtable_size_dict = {} + for dbtable_size_str in output.splitlines(): + dbtable_size_list = dbtable_size_str.split(' ') + assert(len(dbtable_size_list) == 2) + dbtable_size_dict[dbtable_size_list[0]] = float(dbtable_size_list[1]) + known_db_org_dict = {} + if 'special_db_org' in cluster_info.keys(): + known_db_org_dict = cluster_info['special_db_org'] + org_size_desc = get_org_size_desc_from_olap(cluster_name, dbtable_size_dict, known_db_org_dict) + + results = [] + date = time.strftime('%Y-%m-%d', time.localtime()) + period = int(time.mktime(datetime.datetime.strptime(date, "%Y-%m-%d").timetuple())) + for org, size_desc in org_size_desc.iteritems(): + result = dict() + result['period'] = period + result['service_name'] = 'kudu' + result['region_name'] = cluster_info['region'] + result['charge_type_name'] = cluster_info['charge_type'] + result['instance_name'] = cluster_info['instance'] + result['cluster'] = cluster_name + result['account_type'] = 'org' if org.find('CL') != -1 else 'kerberos' + result['account'] = org + result['usage'] = size_desc['size'] + 
result['charge_object'] = size_desc['desc'].top_k() + results.append(result) + origin_usage_filename = get_cluster_stat_filename(date, cluster_name) + with open(origin_usage_filename, 'w') as origin_usage_file: + json.dump(results, origin_usage_file) + origin_usage_file.close() + + g_commit_filenames.append(origin_usage_filename) + + +def get_cluster_info(cluster_name): + if cluster_name not in g_clusters_info.keys(): + kudu_utils.LOG.fatal('Cluster %s not found' % cluster_name) + return None + + cluster_info = g_clusters_info[cluster_name] + if cluster_info['charge_type'] == 'public_share': + kudu_utils.LOG.warning('Ignore public_share cluster %s' % cluster_name) + return None + + return cluster_info + + +def collect_origin_usage_for_clusters(cluster_name_list): + for cluster_name in cluster_name_list: + cluster_info = get_cluster_info(cluster_name) + if not cluster_info: + continue + collect_origin_usage_for_cluster(cluster_name, cluster_info) + + +def calc_usage_result(origin_usage_filename, service_usage_file): + kudu_utils.LOG.info('Start to process daily statistics file %s' % origin_usage_filename) + if not os.path.exists(origin_usage_filename): + kudu_utils.LOG.error('File not exist') + return + with open(origin_usage_filename, 'r') as origin_usage_file: + users_usage = json.load(origin_usage_file) + for user_usage in users_usage: + service_usage_file.write('%s, %s, %s, %s, %s, %s, %s, %s, \'{"storage_bytes":%d}\', \'%s\'\n' + % (user_usage['period'], + user_usage['service_name'], + user_usage['region_name'], + user_usage['charge_type_name'], + user_usage['instance_name'], + user_usage['cluster'], + user_usage['account_type'], + user_usage['account'], + user_usage['usage'], + json.dumps(user_usage['charge_object']))) + origin_usage_file.close() + kudu_utils.LOG.info('Write to file finished') + + +def calc_usage_result_for_cluster(service_usage_file, cluster_name, date): + origin_usage_filename = get_cluster_stat_filename(date, cluster_name) + 
calc_usage_result(origin_usage_filename, service_usage_file) + + +def calc_usage_result_for_clusters(cluster_name_list, date_list): + for date in date_list: + service_usage_filename = get_service_usage_filename(date) + with open(service_usage_filename, 'w') as service_usage_file: + # Write header + service_usage_file.write('period, service_name, region_name, charge_type_name, instance_name, ' + 'cluster, account_type, account, usage, charge_object\n') + for cluster_name in cluster_name_list: + cluster_info = get_cluster_info(cluster_name) + if not cluster_info: + continue + calc_usage_result_for_cluster(service_usage_file, cluster_name, date) + service_usage_file.close() + kudu_utils.upload_usage_data('append', service_usage_filename) + g_commit_filenames.append(service_usage_filename) + + +def push_file_to_repo(filenames): + repo = Repo(g_git_repo_dir) + assert not repo.bare + + remote = repo.remote() + remote.pull() + + index = repo.index + index.add(filenames) + index.commit('Kudu add statistics files') + + remote.push() + + kudu_utils.LOG.info('Pushed files %s to repo' % str(filenames)) + + +def main(argv=None): + if not os.path.exists(g_git_repo_dir + '/.git'): + kudu_utils.LOG.fatal('You must set `g_git_repo_dir` to a valid directory contains `.git`') + return + + if argv is None: + argv = sys.argv + + cluster_name_list = [] + if len(argv) == 1: + # Calculate all clusters + cluster_name_list = list(g_clusters_info.iterkeys()) + elif len(argv) == 2: + # Calculate specified cluster + cluster_name_list.append(argv[1]) + else: + kudu_utils.LOG.fatal('Usage: $0 [cluster_name]') + return + + collect_origin_usage_for_clusters(cluster_name_list) + + # date_list = kudu_utils.get_date_list('2019-06-01', kudu_utils.get_date()) + date_list = [kudu_utils.get_date()] + calc_usage_result_for_clusters(cluster_name_list, date_list) + + push_file_to_repo(g_commit_filenames) + + +if __name__ == "__main__": + main() diff --git a/src/kudu/scripts/falcon_screen.json 
b/src/kudu/scripts/falcon_screen.json new file mode 100644 index 0000000..b15125c --- /dev/null +++ b/src/kudu/scripts/falcon_screen.json @@ -0,0 +1,603 @@ +{ + "comments": [ + { + "screen": "screen名称", + "graphs": [ + { + "title": "graph名称", + "endpoints": ["机器名或者tag标识,tag之间用空格分隔"], + "counters": ["counters名称,多个用逗号分隔。可以使用模糊匹配,支持metric(metric和tags(可选)空格隔开),精确匹配"], + "graph_type": "展示类型,endpoint视角为h,counters视角为k,组合视角为a", + "method": "绘图是否进行求和,求和填写sum,不求和填写空字符串", + "timespan": "展示的时间跨度,单位为秒" + } + ] + } + ], + "version": "20180827", + "counter_templates": { + "full": [ + "metric=kudu-tserver-health service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=kudu-table-health service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=all_transactions_inflight service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=alter_schema_transactions_inflight service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=average_diskrowset_height service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=bloom_lookups_per_op_percentile_99 service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=bloom_lookups service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=bytes_flushed service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=commit_wait_duration_percentile_99 service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=compact_rs_duration_percentile_99 service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=compact_rs_running service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=delta_file_lookups_per_op_percentile_99 service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=delta_file_lookups service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=delta_major_compact_rs_duration_percentile_99 service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=delta_major_compact_rs_running service=kudu cluster=${cluster.name} level=${level} 
v=4", + "metric=delta_minor_compact_rs_duration_percentile_99 service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=delta_minor_compact_rs_running service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=failed_elections_since_stable_leader service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=flush_dms_duration_percentile_99 service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=flush_dms_running service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=flush_mrs_duration_percentile_99 service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=flush_mrs_running service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=follower_memory_pressure_rejections service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=in_progress_ops service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=insertions_failed_dup_key service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=key_file_lookups_per_op_percentile_99 service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=key_file_lookups service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=leader_memory_pressure_rejections service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=log_append_latency_percentile_99 service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=log_bytes_logged service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=log_cache_num_ops service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=log_cache_size service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=log_entry_batches_per_group_percentile_99 service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=log_gc_duration_percentile_99 service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=log_gc_running service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=log_group_commit_latency_percentile_99 service=kudu 
cluster=${cluster.name} level=${level} v=4", + "metric=log_reader_bytes_read service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=log_reader_entries_read service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=log_reader_read_batch_latency_percentile_99 service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=log_roll_latency_percentile_99 service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=log_sync_latency_percentile_99 service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=majority_done_ops service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=memrowset_size service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=mrs_lookups service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=num_rowsets_on_disk service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=on_disk_data_size service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=on_disk_size service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=op_prepare_queue_length_percentile_99 service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=op_prepare_queue_time_percentile_99 service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=op_prepare_run_time_percentile_99 service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=ops_behind_leader service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=raft_term service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=replica_count service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=rows_deleted service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=rows_inserted service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=rows_updated service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=rows_upserted service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=scanner_bytes_returned service=kudu cluster=${cluster.name} 
level=${level} v=4", + "metric=scanner_bytes_scanned_from_disk service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=scanner_cells_returned service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=scanner_cells_scanned_from_disk service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=scanner_rows_returned service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=scanner_rows_scanned service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=scans_started service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=snapshot_read_inflight_wait_duration_percentile_99 service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=state service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=tablet_active_scanners service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=time_since_last_leader_heartbeat service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=transaction_memory_pressure_rejections service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=undo_delta_block_estimated_retained_bytes service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=undo_delta_block_gc_bytes_deleted service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=undo_delta_block_gc_delete_duration_percentile_99 service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=undo_delta_block_gc_init_duration_percentile_99 service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=undo_delta_block_gc_perform_duration_percentile_99 service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=undo_delta_block_gc_running service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=upserts_as_updates service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=write_op_duration_client_propagated_consistency_percentile_99 service=kudu cluster=${cluster.name} level=${level} v=4", + 
"metric=write_op_duration_commit_wait_consistency_percentile_99 service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=write_transactions_inflight service=kudu cluster=${cluster.name} level=${level} v=4" + ], + "table_ab": [ + "metric=all_transactions_inflight service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=alter_schema_transactions_inflight service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=average_diskrowset_height service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=bloom_lookups_per_op_percentile_99 service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=bloom_lookups service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=bytes_flushed service=kudu cluster=${cluster.name} level=${level} v=4" + ], + "table_c": [ + "metric=commit_wait_duration_percentile_99 service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=compact_rs_duration_percentile_99 service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=compact_rs_running service=kudu cluster=${cluster.name} level=${level} v=4" + ], + "table_d": [ + "metric=delta_file_lookups service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=delta_file_lookups_per_op_percentile_99 service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=delta_major_compact_rs_duration_percentile_99 service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=delta_major_compact_rs_running service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=delta_minor_compact_rs_duration_percentile_99 service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=delta_minor_compact_rs_running service=kudu cluster=${cluster.name} level=${level} v=4" + ], + "table_f": [ + "metric=failed_elections_since_stable_leader service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=flush_dms_duration_percentile_99 service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=flush_dms_running 
service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=flush_mrs_duration_percentile_99 service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=flush_mrs_running service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=follower_memory_pressure_rejections service=kudu cluster=${cluster.name} level=${level} v=4" + ], + "table_ghijk": [ + "metric=in_progress_ops service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=insertions_failed_dup_key service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=key_file_lookups service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=key_file_lookups_per_op_percentile_99 service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=kudu-table-health service=kudu cluster=${cluster.name} level=${level} v=4" + ], + "table_l": [ + "metric=leader_memory_pressure_rejections service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=log_append_latency_percentile_99 service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=log_bytes_logged service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=log_cache_num_ops service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=log_cache_size service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=log_entry_batches_per_group_percentile_99 service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=log_gc_duration_percentile_99 service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=log_gc_running service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=log_group_commit_latency_percentile_99 service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=log_reader_bytes_read service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=log_reader_entries_read service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=log_reader_read_batch_latency_percentile_99 service=kudu cluster=${cluster.name} level=${level} v=4", + 
"metric=log_roll_latency_percentile_99 service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=log_sync_latency_percentile_99 service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=lth service=kudu cluster=${cluster.name} level=${level} v=4" + ], + "table_mn": [ + "metric=majority_done_ops service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=memrowset_size service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=mrs_lookups service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=num_rowsets_on_disk service=kudu cluster=${cluster.name} level=${level} v=4" + ], + "table_o": [ + "metric=on_disk_data_size service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=on_disk_size service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=op_prepare_queue_length_percentile_99 service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=op_prepare_queue_time_percentile_99 service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=op_prepare_run_time_percentile_99 service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=ops_behind_leader service=kudu cluster=${cluster.name} level=${level} v=4" + ], + "table_r": [ + "metric=raft_term service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=replica_count service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=rows_deleted service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=rows_inserted service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=rows_updated service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=rows_upserted service=kudu cluster=${cluster.name} level=${level} v=4" + ], + "table_s": [ + "metric=scanner_bytes_returned service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=scanner_bytes_scanned_from_disk service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=scanner_cells_returned service=kudu cluster=${cluster.name} 
level=${level} v=4", + "metric=scanner_cells_scanned_from_disk service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=scanner_rows_returned service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=scanner_rows_scanned service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=scans_started service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=snapshot_read_inflight_wait_duration_percentile_99 service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=state service=kudu cluster=${cluster.name} level=${level} v=4" + ], + "table_u": [ + "metric=undo_delta_block_estimated_retained_bytes service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=undo_delta_block_gc_bytes_deleted service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=undo_delta_block_gc_delete_duration_percentile_99 service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=undo_delta_block_gc_init_duration_percentile_99 service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=undo_delta_block_gc_perform_duration_percentile_99 service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=undo_delta_block_gc_running service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=upserts_as_updates service=kudu cluster=${cluster.name} level=${level} v=4" + ], + "table_tw": [ + "metric=tablet_active_scanners service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=time_since_last_leader_heartbeat service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=transaction_memory_pressure_rejections service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=write_op_duration_client_propagated_consistency_percentile_99 service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=write_op_duration_commit_wait_consistency_percentile_99 service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=write_transactions_inflight service=kudu cluster=${cluster.name} 
level=${level} v=4" + ], + "replica_count" : [ + "metric=replica_count service=kudu cluster=${cluster.name} level=${level} v=4" + ], + "cluster" : [ + "metric=kudu.success service=kudu level=${level}", + "metric=kudu.writeLatency service=kudu level=${level}", + "metric=kudu.scanLatency service=kudu level=${level}", + "metric=healthy_table_proportion service=kudu cluster=${cluster.name} level=${level} v=4" + ], + "cluster_stat" : [ + "metric=masters_count service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=tservers_count service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=tables_count service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=tablets_count service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=replicas_count service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=on_disk_size service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=on_disk_data_size service=kudu cluster=${cluster.name} level=${level} v=4" + ], + "sys" : [ + "cpu.busy", + "load.15min", + "load.1min", + "load.5min", + "mem.memused", + "mem.memused.percent", + "net.if.in.bytes/iface=eth0", + "net.if.out.bytes/iface=eth0", + "net.if.total.dropped/iface=eth0", + "sys.ntp.offset/procname=chronyd" + ], + "disk_usage_percent" : [ + "df.bytes.used.percent/fstype=ext4,mount=/home", + "df.bytes.used.percent/fstype=ext4,mount=/home/work/ssd1", + "df.bytes.used.percent/fstype=ext4,mount=/home/work/ssd2", + "df.bytes.used.percent/fstype=ext4,mount=/home/work/ssd3", + "df.bytes.used.percent/fstype=ext4,mount=/home/work/ssd4", + "df.bytes.used.percent/fstype=ext4,mount=/home/work/ssd5", + "df.bytes.used.percent/fstype=ext4,mount=/home/work/ssd6", + "df.bytes.used.percent/fstype=ext4,mount=/home/work/ssd7", + "df.bytes.used.percent/fstype=ext4,mount=/home/work/ssd8", + "df.bytes.used.percent/fstype=ext4,mount=/home/work/ssd9", + "df.bytes.used.percent/fstype=ext4,mount=/home/work/ssd10", + 
"df.bytes.used.percent/fstype=ext4,mount=/home/work/ssd11", + "df.bytes.used.percent/fstype=ext4,mount=/home/work/ssd12", + "df.bytes.used.percent/fstype=ext4,mount=/home/work/hdd1", + "df.bytes.used.percent/fstype=ext4,mount=/home/work/hdd2", + "df.bytes.used.percent/fstype=ext4,mount=/home/work/hdd3", + "df.bytes.used.percent/fstype=ext4,mount=/home/work/hdd4", + "df.bytes.used.percent/fstype=ext4,mount=/home/work/hdd5", + "df.bytes.used.percent/fstype=ext4,mount=/home/work/hdd6", + "df.bytes.used.percent/fstype=ext4,mount=/home/work/hdd7", + "df.bytes.used.percent/fstype=ext4,mount=/home/work/hdd8", + "df.bytes.used.percent/fstype=ext4,mount=/home/work/hdd9", + "df.bytes.used.percent/fstype=ext4,mount=/home/work/hdd10", + "df.bytes.used.percent/fstype=ext4,mount=/home/work/hdd11" + ], + "disk_usage_size" : [ + "df.bytes.used/fstype=ext4,mount=/home", + "df.bytes.used/fstype=ext4,mount=/home/work/ssd1", + "df.bytes.used/fstype=ext4,mount=/home/work/ssd2", + "df.bytes.used/fstype=ext4,mount=/home/work/ssd3", + "df.bytes.used/fstype=ext4,mount=/home/work/ssd4", + "df.bytes.used/fstype=ext4,mount=/home/work/ssd5", + "df.bytes.used/fstype=ext4,mount=/home/work/ssd6", + "df.bytes.used/fstype=ext4,mount=/home/work/ssd7", + "df.bytes.used/fstype=ext4,mount=/home/work/ssd8", + "df.bytes.used/fstype=ext4,mount=/home/work/ssd9", + "df.bytes.used/fstype=ext4,mount=/home/work/ssd10", + "df.bytes.used/fstype=ext4,mount=/home/work/ssd11", + "df.bytes.used/fstype=ext4,mount=/home/work/ssd12", + "df.bytes.used/fstype=ext4,mount=/home/work/hdd1", + "df.bytes.used/fstype=ext4,mount=/home/work/hdd2", + "df.bytes.used/fstype=ext4,mount=/home/work/hdd3", + "df.bytes.used/fstype=ext4,mount=/home/work/hdd4", + "df.bytes.used/fstype=ext4,mount=/home/work/hdd5", + "df.bytes.used/fstype=ext4,mount=/home/work/hdd6", + "df.bytes.used/fstype=ext4,mount=/home/work/hdd7", + "df.bytes.used/fstype=ext4,mount=/home/work/hdd8", + "df.bytes.used/fstype=ext4,mount=/home/work/hdd9", + 
"df.bytes.used/fstype=ext4,mount=/home/work/hdd10", + "df.bytes.used/fstype=ext4,mount=/home/work/hdd11" + ], + "disk_io_util" : [ + "disk.io.util/device=sdb", + "disk.io.util/device=sdc", + "disk.io.util/device=sdd", + "disk.io.util/device=sde", + "disk.io.util/device=sdf", + "disk.io.util/device=sdg", + "disk.io.util/device=sdh", + "disk.io.util/device=sdi", + "disk.io.util/device=sdj", + "disk.io.util/device=sdk", + "disk.io.util/device=sdl", + "disk.io.util/device=sdm", + "disk.io.util/device=nvme0n1", + "disk.io.util/device=nvme1n1", + "disk.io.util/device=nvme2n1", + "disk.io.util/device=nvme3n1", + "disk.io.util/device=nvme4n1", + "disk.io.util/device=nvme5n1", + "disk.io.util/device=xvda", + "disk.io.util/device=xvdb", + "disk.io.util/device=xvdc", + "disk.io.util/device=xvdd", + "disk.io.util/device=xvde", + "disk.io.util/device=xvdf", + "disk.io.util/device=vda", + "disk.io.util/device=vdb", + "disk.io.util/device=vdc", + "disk.io.util/device=vdd", + "disk.io.util/device=vde" + ] + }, + "details": [ + { + "screen": "${cluster.name} [cluster]", + "graphs": [ + { + "title": "集群可用度", + "endpoints": ["${cluster.name}"], + "counters": { + "level": "cluster", + "template": "cluster" + }, + "graph_type": "h", + "method": "", + "timespan": 86400 + }, + { + "title": "集群统计信息", + "endpoints": ["${cluster.name}"], + "counters": { + "level": "cluster", + "template": "cluster_stat" + }, + "graph_type": "h", + "method": "", + "timespan": 86400 + } + ] + }, + { + "screen": "${cluster.name} [metrics_ab]", + "graphs": [ + { + "title": "单表metrics", + "endpoints": ["${for.each.table}"], + "counters": { + "level": "table", + "template": "table_ab" + }, + "graph_type": "h", + "method": "", + "timespan": 86400 + } + ] + }, + { + "screen": "${cluster.name} [metrics_c]", + "graphs": [ + { + "title": "单表metrics", + "endpoints": ["${for.each.table}"], + "counters": { + "level": "table", + "template": "table_c" + }, + "graph_type": "h", + "method": "", + "timespan": 86400 + } + ] + 
}, + { + "screen": "${cluster.name} [metrics_d]", + "graphs": [ + { + "title": "单表metrics", + "endpoints": ["${for.each.table}"], + "counters": { + "level": "table", + "template": "table_d" + }, + "graph_type": "h", + "method": "", + "timespan": 86400 + } + ] + }, + { + "screen": "${cluster.name} [metrics_f]", + "graphs": [ + { + "title": "单表metrics", + "endpoints": ["${for.each.table}"], + "counters": { + "level": "table", + "template": "table_f" + }, + "graph_type": "h", + "method": "", + "timespan": 86400 + } + ] + }, + { + "screen": "${cluster.name} [metrics_ghijk]", + "graphs": [ + { + "title": "单表metrics", + "endpoints": ["${for.each.table}"], + "counters": { + "level": "table", + "template": "table_ghijk" + }, + "graph_type": "h", + "method": "", + "timespan": 86400 + } + ] + }, + { + "screen": "${cluster.name} [metrics_l]", + "graphs": [ + { + "title": "单表metrics", + "endpoints": ["${for.each.table}"], + "counters": { + "level": "table", + "template": "table_l" + }, + "graph_type": "h", + "method": "", + "timespan": 86400 + } + ] + }, + { + "screen": "${cluster.name} [metrics_mn]", + "graphs": [ + { + "title": "单表metrics", + "endpoints": ["${for.each.table}"], + "counters": { + "level": "table", + "template": "table_mn" + }, + "graph_type": "h", + "method": "", + "timespan": 86400 + } + ] + }, + { + "screen": "${cluster.name} [metrics_o]", + "graphs": [ + { + "title": "单表metrics", + "endpoints": ["${for.each.table}"], + "counters": { + "level": "table", + "template": "table_o" + }, + "graph_type": "h", + "method": "", + "timespan": 86400 + } + ] + }, + { + "screen": "${cluster.name} [metrics_r]", + "graphs": [ + { + "title": "单表metrics", + "endpoints": ["${for.each.table}"], + "counters": { + "level": "table", + "template": "table_r" + }, + "graph_type": "h", + "method": "", + "timespan": 86400 + } + ] + }, + { + "screen": "${cluster.name} [metrics_s]", + "graphs": [ + { + "title": "单表metrics", + "endpoints": ["${for.each.table}"], + "counters": { + 
"level": "table", + "template": "table_s" + }, + "graph_type": "h", + "method": "", + "timespan": 86400 + } + ] + }, + { + "screen": "${cluster.name} [metrics_u]", + "graphs": [ + { + "title": "单表metrics", + "endpoints": ["${for.each.table}"], + "counters": { + "level": "table", + "template": "table_u" + }, + "graph_type": "h", + "method": "", + "timespan": 86400 + } + ] + }, + { + "screen": "${cluster.name} [metrics_tw]", + "graphs": [ + { + "title": "单表metrics", + "endpoints": ["${for.each.table}"], + "counters": { + "level": "table", + "template": "table_tw" + }, + "graph_type": "h", + "method": "", + "timespan": 86400 + } + ] + }, + { + "screen": "${cluster.name} [tserver]", + "graphs": [ + { + "title": "单节点metrics", + "endpoints": ["${for.each.tserver}"], + "counters": { + "level": "host", + "template": "full" + }, + "graph_type": "h", + "method": "", + "timespan": 86400 + } + ] + }, + { + "screen": "${cluster.name} [server-sys]", + "graphs": [ + { + "title": "单节点sys指标", + "endpoints": ["${for.each.tserver}", "${for.each.master}"], + "counters": { + "level": "host", + "template": "sys" + }, + "graph_type": "h", + "method": "", + "timespan": 86400 + }, + { + "title": "磁盘用量(百分比)", + "endpoints": ["${for.each.tserver}", "${for.each.master}"], + "counters": { + "level": "host", + "template": "disk_usage_percent" + }, + "graph_type": "a", + "method": "", + "timespan": 86400 + }, + { + "title": "磁盘用量(占用空间)", + "endpoints": ["${for.each.tserver}", "${for.each.master}"], + "counters": { + "level": "host", + "template": "disk_usage_size" + }, + "graph_type": "a", + "method": "", + "timespan": 86400 + }, + { + "title": "磁盘IO util", + "endpoints": ["${for.each.tserver}", "${for.each.master}"], + "counters": { + "level": "host", + "template": "disk_io_util" + }, + "graph_type": "a", + "method": "", + "timespan": 86400 + } + ] + } + ] +} diff --git a/src/kudu/scripts/falcon_screen.py b/src/kudu/scripts/falcon_screen.py new file mode 100755 index 0000000..26f330a --- 
/dev/null +++ b/src/kudu/scripts/falcon_screen.py @@ -0,0 +1,603 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import requests +import json +import re +import sys + +# +# RESTful API doc: http://wiki.n.miui.com/pages/viewpage.action?pageId=66037692 +# falcon ctrl api: http://dev.falcon.srv/doc/ +# + +# account info +serviceAccount = "" +serviceSeedMd5 = "" + +############################################################################### + +# global variables +falconServiceUrl = "http://falcon.srv" +# falconServiceUrl = "http://dev.falcon.srv" +kuduScreenId = 25748 +KUDU_CLUSTER_ID = 37613 +KUDU_TABLES_ID = 37638 +KUDU_TSERVER_ID = 37639 +KUDU_SYS_ID = 37640 +screenIdList = { + KUDU_CLUSTER_ID: "[cluster]", + KUDU_TABLES_ID: [ + "[metrics_ab]", + "[metrics_c]", + "[metrics_d]", + "[metrics_f]", + "[metrics_ghijk]", + "[metrics_l]", + "[metrics_mn]", + "[metrics_o]", + "[metrics_r]", + "[metrics_s]", + "[metrics_u]", + "[metrics_tw]"], + KUDU_TSERVER_ID: "[tserver]", + KUDU_SYS_ID: "[server-sys]"} +# kuduScreenId = 351 +sessionId = "" +metaPort = "" +replicaPort = "" +collectorPort = "" + + +# return: +def get_session_id(): + url = falconServiceUrl + "/v1.0/auth/info" + headers = { + "Accept": "text/plain" + } + + r = requests.get(url, headers=headers) + if r.status_code != 200: + print( + "ERROR: get_session_id failed, status_code = %s, result:\n%s" % + (r.status_code, r.text)) + sys.exit(1) + + c = r.headers['Set-Cookie'] + m = re.search('falconSessionId=([^;]+);', c) + if m: + global sessionId + sessionId = m.group(1) + print("INFO: sessionId =", sessionId) + else: + print("ERROR: get_session_id failed, cookie not set") + sys.exit(1) + + +# return: +def auth_by_misso(): + url = falconServiceUrl + "/v1.0/auth/callback/misso" + headers = { + "Cookie": "falconSessionId=" + + sessionId, + "Authorization": serviceAccount + + ";" + + serviceSeedMd5 + + ";" + + serviceSeedMd5} + + r = requests.get(url, headers=headers) + if r.status_code != 200: + print( + 
"ERROR: auth_by_misso failed, status_code = %s, result:\n%s" % + (r.status_code, r.text)) + sys.exit(1) + + +# return: +def check_auth_info(): + url = falconServiceUrl + "/v1.0/auth/info" + headers = { + "Cookie": "falconSessionId=" + sessionId + } + + r = requests.get(url, headers=headers) + if r.status_code != 200: + print( + "ERROR: check_auth_info failed, status_code = %s, result:\n%s" % + (r.status_code, r.text)) + sys.exit(1) + + j = json.loads(r.text) + if "user" not in j or j["user"] is None or "name" not in j["user"] or j["user"]["name"] != serviceAccount: + print("ERROR: check_auth_info failed, bad json result:\n%s" % r.text) + sys.exit(1) + + +def login(): + get_session_id() + auth_by_misso() + check_auth_info() + print("INFO: login succeed") + + +# return: +def logout(): + url = falconServiceUrl + "/v1.0/auth/logout" + headers = { + "Cookie": "falconSessionId=" + sessionId + } + + r = requests.get(url, headers=headers) + if r.status_code != 200: + print( + "ERROR: logout failed, status_code = %s, result:\n%s" % + (r.status_code, r.text)) + sys.exit(1) + + print("INFO: logout succeed") + + +# return: screenId +def create_screen(screenName, scrid): + url = falconServiceUrl + "/v1.0/dashboard/screen" + headers = { + "Cookie": "falconSessionId=" + sessionId + } + req = { + "pid": scrid, + "name": screenName + } + + r = requests.post(url, headers=headers, data=json.dumps(req)) + if r.status_code != 200: + print( + "ERROR: create_screen failed, screenName = %s, status_code = %s, result:\n%s" % + (screenName, r.status_code, r.text)) + sys.exit(1) + + j = json.loads(r.text) + if "id" not in j: + print( + "ERROR: create_screen failed, screenName = %s, bad json result\n%s" % + (screenName, r.text)) + sys.exit(1) + + screenId = j["id"] + print( + "INFO: create_screen succeed, screenName = %s, screenId = %s" % + (screenName, screenId)) + return screenId + + +def parse_lines(file_name): + lines = [] + for line in open(file_name): + line.strip() + if len(line) > 0: + 
if line in lines: + print("ERROR: bad file: duplicate line '%s'" % line) + sys.exit(1) + lines.append(line) + return lines + + +# return: screenConfigs +def prepare_screen_config( + clusterName, + templateName, + screenTemplateFile, + tableListFile, + masterListFile, + tserverListFile): + # tableList + tableList = parse_lines(tableListFile) + if len(tableList) == 0: + print("WARN: empty table list file, will not create table level falcon screen") + + # masterList + masterList = parse_lines(masterListFile) + if len(masterList) == 0: + print("ERROR: bad master list file: should be non-empty list") + sys.exit(1) + + # tserverList + tserverList = parse_lines(tserverListFile) + if len(tserverList) == 0: + print("ERROR: bad tserver list file: should be non-empty list") + sys.exit(1) + + # template json + jsonData = json.loads(open(screenTemplateFile).read()) + templateJson = jsonData['counter_templates'] + screensJson = jsonData['details'] + if not isinstance(screensJson, list) or len(screensJson) == 0: + print( + "ERROR: bad screen template json: [details] should be provided as non-empty list") + sys.exit(1) + + screenConfigs = {} + for screenJson in screensJson: + # screen name + screen = screenJson["screen"] + if not isinstance(screen, (str, unicode)) or len(screen) == 0: + print( + "ERROR: bad json: [details][screen]: should be provided as non-empty str") + sys.exit(1) + screen = screen.replace("${cluster.name}", clusterName) + if screen in screenConfigs: + print("ERROR: duplicate screen '%s'" % screen) + sys.exit(1) + + # graphs in screen + graphConfigs = [] + position = 1 + for graphJson in screenJson['graphs']: + # title + title = graphJson["title"] + if not isinstance(title, (str, unicode)) or len(title) == 0: + print( + "ERROR: bad json: [details][%s][graphs][%s]: [title] should be provided as non-empty str" % + (screen, title)) + sys.exit(1) + if title in graphConfigs: + print("ERROR: duplicate title '%s'" % title) + sys.exit(1) + + # endpoints + endpoints = 
graphJson["endpoints"] + newEndpoints = [] + for endpoint in endpoints: + if len(endpoint) != 0: + if endpoint.find("${cluster.name}") != -1: + newEndpoints.append( + endpoint.replace( + "${cluster.name}", + clusterName)) + elif endpoint.find("${for.each.master}") != -1: + newEndpoints += masterList + elif endpoint.find("${for.each.tserver}") != -1: + newEndpoints += tserverList + elif endpoint.find("${for.each.table}") != -1: + newEndpoints += tableList + else: + newEndpoints.append(endpoint) + newEndpoints = list(set(newEndpoints)) + if len(newEndpoints) == 0: + print( + "WARN: bad json: [details][%s][graphs][%s]: [endpoints] should be provided as non-empty list" % + (screen, title)) + + # counters + newCounters = [] + counters = graphJson["counters"] + if not isinstance(counters, dict) or len(counters) == 0: + print( + "ERROR: bad json: [details][%s][graphs][%s]: [counters] should be provided as non-empty list/dict" % + (screen, title)) + sys.exit(1) + for counter in templateJson[counters["template"] if counters.has_key("template") else templateName]: + newCounters.append( + counter.replace( + "${cluster.name}", + clusterName). 
replace( + "${level}", + counters["level"])) + if len(newCounters) == 0: + print( + "ERROR: bad json: [details][%s][graphs][%s]: [counters] should be provided as non-empty list" % + (screen, title)) + sys.exit(1) + + # graphType + graphType = graphJson["graph_type"] + if not isinstance(graphType, (str, unicode)): + print( + "ERROR: bad json: [details][%s][graphs][%s]: [graph_type] should be provided as non-empty list" % + (screen, title)) + sys.exit(1) + if graphType != "h" and graphType != "k" and graphType != "a": + print( + "ERROR: bad json: [details][%s][graphs][%s]: [graph_type] should be 'h' or 'k' or 'a'" % + (screen, title)) + sys.exit(1) + + # method + method = graphJson["method"] + if not isinstance(method, (str, unicode)): + print( + "ERROR: bad json: [details][%s][graphs][%s]: [method] should be provided as str" % + (screen, title)) + sys.exit(1) + if method != "" and method != "sum": + print( + "ERROR: bad json: [details][%s][graphs][%s]: [method] should be '' or 'sum'" % + (screen, title)) + sys.exit(1) + + # timespan + timespan = graphJson["timespan"] + if not isinstance(timespan, int) or timespan <= 0: + print( + "ERROR: bad json: [details][%s][graphs][%s]: [timespan] should be provided as positive int" % + (screen, title)) + sys.exit(1) + + graphConfig = {} + graphConfig["counters"] = newCounters + graphConfig["endpoints"] = newEndpoints + graphConfig["falcon_tags"] = "" + graphConfig["graph_type"] = graphType + graphConfig["method"] = method + graphConfig["position"] = position + graphConfig["timespan"] = timespan + graphConfig["title"] = title + graphConfigs.append(graphConfig) + + position += 1 + screenConfigs[screen] = graphConfigs + + return screenConfigs + + +# return: graphId +def create_graph(graphConfig): + url = falconServiceUrl + "/v1.0/dashboard/graph" + headers = { + "Cookie": "falconSessionId=" + sessionId + } + + r = requests.post(url, headers=headers, data=json.dumps(graphConfig)) + if r.status_code != 200: + print( + "ERROR: 
create_graph failed, graphTitle = \"%s\", status_code = %s, result:\n%s" % + (graphConfig["title"], r.status_code, r.text)) + sys.exit(1) + + j = json.loads(r.text) + if "id" not in j: + print( + "ERROR: create_graph failed, graphTitle = \"%s\", bad json result\n%s" % + (graphConfig["title"], r.text)) + sys.exit(1) + + graphId = j["id"] + print("INFO: create_graph succeed, graphTitle = \"%s\", graphId = %s" + % (graphConfig["title"], graphId)) + + # update graph position immediately + graphConfig["id"] = graphId + update_graph(graphConfig, "position") + + return graphId + + +# return: screen[] +def get_kudu_screens(scrid): + url = falconServiceUrl + "/v1.0/dashboard/screen/pid/" + str(scrid) + headers = { + "Cookie": "falconSessionId=" + sessionId + } + + r = requests.get(url, headers=headers) + if r.status_code != 200: + print( + "ERROR: get_kudu_screens failed, status_code = %s, result:\n%s" % + (r.status_code, r.text)) + sys.exit(1) + + j = json.loads(r.text) + + print("INFO: get_kudu_screens succeed, screenCount = %s" % len(j)) + return j + + +# return: graph[] +def get_screen_graphs(screenName, screenId): + url = falconServiceUrl + "/v1.0/dashboard/graph/screen/" + str(screenId) + headers = { + "Cookie": "falconSessionId=" + sessionId + } + + r = requests.get(url, headers=headers) + if r.status_code != 200: + print( + "ERROR: get_screen_graphs failed, screenName = %s, screenId = %s, status_code = %s, result:\n%s" % + (screenName, screenId, r.status_code, r.text)) + sys.exit(1) + + j = json.loads(r.text) + + print( + "INFO: get_screen_graphs succeed, screenName = %s, screenId = %s, graphCount = %s" % + (screenName, screenId, len(j))) + return j + + +# return: +def delete_graph(graphTitle, graphId): + url = falconServiceUrl + "/v1.0/dashboard/graph/" + str(graphId) + headers = { + "Cookie": "falconSessionId=" + sessionId + } + + r = requests.delete(url, headers=headers) + if r.status_code != 200 or r.text.find("delete success!") == -1: + print( + "ERROR: 
delete_graph failed, graphTitle = \"%s\", graphId = %s, status_code = %s, result:\n%s" % + (graphTitle, graphId, r.status_code, r.text)) + sys.exit(1) + + print( + "INFO: delete_graph succeed, graphTitle = \"%s\", graphId = %s" % + (graphTitle, graphId)) + + +# return: +def update_graph(graphConfig, updateReason): + url = falconServiceUrl + "/v1.0/dashboard/graph/" + str(graphConfig["id"]) + headers = { + "Cookie": "falconSessionId=" + sessionId + } + + r = requests.put(url, headers=headers, data=json.dumps(graphConfig)) + if r.status_code != 200: + print( + "ERROR: update_graph failed, graphTitle = \"%s\", graphId = %s, status_code = %s, result:\n%s" % + (graphConfig["title"], graphConfig["id"], r.status_code, r.text)) + sys.exit(1) + + j = json.loads(r.text) + if "id" not in j: + print( + "ERROR: update_graph failed, graphTitle = \"%s\", graphId = %s, bad json result\n%s" % + (graphConfig["title"], graphConfig["id"], r.text)) + sys.exit(1) + + print( + "INFO: update_graph succeed, graphTitle = \"%s\", graphId = %s, updateReason = \"%s changed\"" % + (graphConfig["title"], graphConfig["id"], updateReason)) + + +# return: bool, reason +def is_equal(graph1, graph2): + if graph1["title"] != graph2["title"]: + return False, "title" + if graph1["graph_type"] != graph2["graph_type"]: + return False, "graph_type" + if graph1["method"] != graph2["method"]: + return False, "method" + if graph1["position"] != graph2["position"]: + return False, "position" + if graph1["timespan"] != graph2["timespan"]: + return False, "timespan" + endpoints1 = graph1["endpoints"] + endpoints2 = graph2["endpoints"] + if len(endpoints1) != len(endpoints2): + return False, "endpoints" + for endpoint in endpoints1: + if endpoint not in endpoints2: + return False, "endpoints" + counters1 = graph1["counters"] + counters2 = graph2["counters"] + if len(counters1) != len(counters2): + return False, "counters" + for counter in counters1: + if counter not in counters2: + return False, "counters" + 
return True, "" + + +def create_screen_and_graphs(screenName, scrid, graphConfigs): + + # create screen + screenId = create_screen(screenName, scrid) + for graphConfig in graphConfigs: + graphConfig["screen_id"] = screenId + create_graph(graphConfig) + print("INFO: %s graphs created for %s" % (len(graphConfigs), screenName)) + + +def update_screen_and_graphs(screenName, screenId, graphConfigs): + oldGraphConfigs = get_screen_graphs(screenName, screenId) + if oldGraphConfigs is None: + print( + "ERROR: screen '%s' not exit, please create it first" % + clusterName) + sys.exit(1) + + # list -> dict + oldGraphConfigsDict = {} + newGraphConfigsDict = {} + for graph in oldGraphConfigs: + oldGraphConfigsDict[graph["title"]] = graph + for graph in graphConfigs: + newGraphConfigsDict[graph["title"]] = graph + + deleteConfigList = [] + createConfigList = [] + updateConfigList = [] + for graph in oldGraphConfigs: + if not graph["title"] in newGraphConfigsDict: + deleteConfigList.append((graph["title"], graph["graph_id"])) + for graph in graphConfigs: + if not graph["title"] in oldGraphConfigsDict: + graph["screen_id"] = screenId + createConfigList.append(graph) + else: + oldGraph = oldGraphConfigsDict[graph["title"]] + equal, reason = is_equal(graph, oldGraph) + if not equal: + graph["id"] = oldGraph["graph_id"] + graph["screen_id"] = screenId + updateConfigList.append((graph, reason)) + + for graphTitle, graphId in deleteConfigList: + delete_graph(graphTitle, graphId) + for graph in createConfigList: + create_graph(graph) + for graph, reason in updateConfigList: + update_graph(graph, reason) + + print("INFO: %d graphs deleted, %d graphs created, %d graphs updated" % + (len(deleteConfigList), len(createConfigList), len(updateConfigList))) + + +if __name__ == '__main__': + if serviceAccount == "" or serviceSeedMd5 == "": + print( + "ERROR: please set 'serviceAccount' and 'serviceSeedMd5' in %s" % + sys.argv[0]) + sys.exit(1) + + if len(sys.argv) != 7: + print( + "USAGE: python 
%s <cluster_name> <template_name> <screen_template_file> <master_list_file> <tserver_list_file> <table_list_file>" % + sys.argv[0]) + sys.exit(1) + + clusterName = sys.argv[1] + templateName = sys.argv[2] + screenTemplateFile = sys.argv[3] + masterListFile = sys.argv[4] + tserverListFile = sys.argv[5] + tableListFile = sys.argv[6] + + login() + + for scrid, scrNames in screenIdList.items(): + oldKuduScreens = get_kudu_screens(scrid) + oldScreenName2Id = {} + screenConfigs = prepare_screen_config( + clusterName, + templateName, + screenTemplateFile, + tableListFile, + masterListFile, + tserverListFile) + for oldScreen in oldKuduScreens: + oldScreenName2Id[oldScreen['name']] = oldScreen['id'] + if scrid == KUDU_TABLES_ID: + for scrName in scrNames: + screenName = clusterName + " " + scrName + graphConfigs = screenConfigs[screenName] + if screenName not in oldScreenName2Id: + # create screen + create_screen_and_graphs(screenName, scrid, graphConfigs) + else: + # update screen + screenId = oldScreenName2Id[screenName] + update_screen_and_graphs( + screenName, screenId, graphConfigs) + else: + screenName = clusterName + " " + scrNames + graphConfigs = screenConfigs[screenName] + if screenName not in oldScreenName2Id: + # create screen + create_screen_and_graphs(screenName, scrid, graphConfigs) + else: + # update screen + screenId = oldScreenName2Id[screenName] + update_screen_and_graphs(screenName, screenId, graphConfigs) + + logout() diff --git a/src/kudu/scripts/kudu_falcon_screen.sh b/src/kudu/scripts/kudu_falcon_screen.sh new file mode 100755 index 0000000..046352b --- /dev/null +++ b/src/kudu/scripts/kudu_falcon_screen.sh @@ -0,0 +1,119 @@ +#!/bin/bash + +PID=$$ +BASE_DIR="$( cd "$( dirname "$0" )" && pwd )" +KUDU=${KUDU_HOME}/kudu +COLLECTOR=${KUDU_HOME}/kudu_collector +if [[ ! -f ${KUDU} || ! -f ${COLLECTOR} ]]; then + echo "ERROR: ${KUDU} or ${COLLECTOR} not found" + exit 1 +fi +KUDURC=${KUDU_CONFIG}/kudurc +if [[ ! 
-f ${KUDURC} ]]; then + echo "ERROR: ${KUDURC} not found" + exit 1 +fi + +function usage() { +cat << EOF +This tool is for update falcon screen for specified kudu cluster. +USAGE: $0 <cluster_name> [table_count] [metrics_template] + cluster_name Cluster name operated on, should be configurated in $KUDU_CONFIG/kudurc + table_count An indicator of how many tables will be monitored, actual monitored table count is in range [table_count, 3*table_count] + metrics_template Which metric template will be used, 'simple' or 'full' +EOF +} + +if [[ $# -lt 1 ]] +then + usage + exit 1 +fi + +CLUSTER=$1 +TABLE_COUNT=9999 +if [[ $# -ge 2 ]] +then + TABLE_COUNT=$2 +fi + +TEMPLATE_NAME='full' +if [[ $# -ge 3 ]] +then + TEMPLATE_NAME=$3 +fi +if [[ "${TEMPLATE_NAME}"x != "simple"x && "${TEMPLATE_NAME}"x != "full"x ]] +then + usage + exit 1 +fi + +echo "UID: ${UID}" +echo "PID: ${PID}" +echo "cluster: ${CLUSTER}" +echo "top n table: ${TABLE_COUNT}" +echo "metric template: ${TEMPLATE_NAME}" +echo "Start time: `date`" +ALL_START_TIME=$((`date +%s`)) +echo + +# get master list +${KUDU} master list @${CLUSTER} -format=space | awk -F' |:' '{print $2}' | sort -n &>/tmp/${UID}.${PID}.kudu.master.list +if [[ $? -ne 0 ]]; then + echo "`kudu master list @${CLUSTER} -format=space` failed" + exit $? +fi + +MASTER_COUNT=`cat /tmp/${UID}.${PID}.kudu.master.list | wc -l` +if [[ ${MASTER_COUNT} -eq 0 ]]; then + echo "ERROR: master list is empty, please check the cluster ${CLUSTER}" + exit -1 +fi + +# get tserver list +${KUDU} tserver list @${CLUSTER} -format=space | awk -F' |:' '{print $2}' | sort -n &>/tmp/${UID}.${PID}.kudu.tserver.list +if [[ $? -ne 0 ]]; then + echo "`kudu tserver list @${CLUSTER} -format=space` failed" + exit $? 
+fi + +TSERVER_COUNT=`cat /tmp/${UID}.${PID}.kudu.tserver.list | wc -l` +if [[ ${TSERVER_COUNT} -eq 0 ]]; then + echo "ERROR: tserver list is empty, please check the cluster ${CLUSTER}" + exit 1 +fi + +function parse_yaml() { + python -c "import yaml;print(yaml.load(open('$1').read(), Loader=yaml.FullLoader)['clusters_info']['$2']['master_addresses'])" +} +MASTERS=$(parse_yaml ${KUDURC} ${CLUSTER}) + +# get table list +${COLLECTOR} -collector_master_addrs=${MASTERS} -collector_cluster_name=${CLUSTER} -collector_report_method=local -collector_metrics=bytes_flushed,on_disk_size,scanner_bytes_returned -log_dir=./log > /tmp/${UID}.${PID}.kudu.metric_table_value +if [[ $? -ne 0 ]]; then + echo "ERROR: ${COLLECTOR} execute failed" + exit 1 +fi + +cat /tmp/${UID}.${PID}.kudu.metric_table_value | egrep "^table bytes_flushed " | sort -rnk4 | head -n ${TABLE_COUNT} | awk '{print $3}' > /tmp/${UID}.${PID}.kudu.top.bytes_flushed +cat /tmp/${UID}.${PID}.kudu.metric_table_value | egrep "^table on_disk_size " | sort -rnk4 | head -n ${TABLE_COUNT} | awk '{print $3}' > /tmp/${UID}.${PID}.kudu.top.on_disk_size +cat /tmp/${UID}.${PID}.kudu.metric_table_value | egrep "^table scanner_bytes_returned " | sort -rnk4 | head -n ${TABLE_COUNT} | awk '{print $3}' > /tmp/${UID}.${PID}.kudu.top.scanner_bytes_returned +cat /tmp/${UID}.${PID}.kudu.top.* | sort -n | uniq > /tmp/${UID}.${PID}.kudu.table.list +echo "total `wc -l /tmp/${UID}.${PID}.kudu.table.list | awk '{print $1}'` tables to monitor" +echo -e "\033[32m Please set the following one line to the kudu collector's \`collector_attributes\` argument manually\033[0m" +echo -n "table_name:" +awk BEGIN{RS=EOF}'{gsub(/\n/,",");print}' /tmp/${UID}.${PID}.kudu.table.list +echo "" + +python ${BASE_DIR}/falcon_screen.py ${CLUSTER} ${TEMPLATE_NAME} ${BASE_DIR}/falcon_screen.json /tmp/${UID}.${PID}.kudu.master.list /tmp/${UID}.${PID}.kudu.tserver.list /tmp/${UID}.${PID}.kudu.table.list +if [[ $? 
-ne 0 ]]; then + echo "ERROR: falcon screen operate failed" + exit 1 +fi + +echo +echo "Finish time: `date`" +ALL_FINISH_TIME=$((`date +%s`)) +echo "Falcon screen operate done, elapsed time is $((ALL_FINISH_TIME - ALL_START_TIME)) seconds." + +rm -f /tmp/${UID}.${PID}.kudu.* &>/dev/null diff --git a/src/kudu/scripts/kudu_utils.py b/src/kudu/scripts/kudu_utils.py new file mode 100755 index 0000000..8dc5dfa --- /dev/null +++ b/src/kudu/scripts/kudu_utils.py @@ -0,0 +1,106 @@ +#! /usr/bin/env python +# coding=utf-8 + +import datetime +import dateutil.relativedelta +import logging +from logging.handlers import RotatingFileHandler +import os +import requests + +LOG = logging.getLogger() +g_time = datetime.datetime.now() + + +def init_log(): + if not os.path.exists('log/'): + os.makedirs('log') + handler = RotatingFileHandler('log/kudu.log', + mode='a', + maxBytes=100*1024*1024, + backupCount=10) + handler.setFormatter( + logging.Formatter( + fmt='%(asctime)s [%(thread)d] [%(levelname)s] %(filename)s:%(lineno)d %(message)s', + datefmt='%Y-%m-%d %H:%M:%S')) + LOG.addHandler(handler) + LOG.setLevel(logging.INFO) + + +def make_dir(path): + try: + os.mkdir(path) + except OSError, e: + if e.errno != os.errno.EEXIST: + raise + pass + + +def script_path(): + return os.path.split(os.path.realpath(__file__))[0] + + +def get_year(last_month): + time = g_time + if last_month: + time += dateutil.relativedelta.relativedelta(months=-1) + return time.strftime('%Y') + + +def get_month(last_month): + time = g_time + if last_month: + time += dateutil.relativedelta.relativedelta(months=-1) + return time.strftime('%m') + + +def prepare_pricing_month_path(last_month=False): + month_base_path = script_path() + '/year=' + get_year(last_month) + make_dir(month_base_path) + month_base_path += '/month=' + get_month(last_month) + make_dir(month_base_path) + data_path = month_base_path + '/data' + make_dir(data_path) + return month_base_path + '/', data_path + '/' + + +def 
get_year_month(last_month): + return get_year(last_month) + '-' + get_month(last_month) + + +def get_date(): + time = g_time + return time.strftime('%Y-%m-%d') + + +def get_date_list(start, end, step=1, format="%Y-%m-%d"): + strptime, strftime = datetime.datetime.strptime, datetime.datetime.strftime + days = (strptime(end, format) - strptime(start, format)).days + return [strftime(strptime(start, format) + datetime.timedelta(i), format) for i in xrange(0, days, step)] + + +# method: +# append: 追加写入账单 +# reload: 清空数据并重新写入账单 +def upload_usage_data(method, filename): + LOG.info('Start to report %s by %s mode' % (filename, method)) + report_url = "http://production-cost.api.xiaomi.net/api/v1/data/upload" + token = "c2534683e5504ab4850c49873a36de61" + url = "%s?sk=%s&method=%s" % (report_url, token, method) + with open(filename, "rb") as f: + resp = requests.post(url, files={"file": f}) + if resp.status_code == 200: + # 成功 + details = resp.json() + if details['code'] == 0: + LOG.info('Succeed to report %s by %s mode' % (filename, method)) + else: + LOG.fatal('Failed to report %s by %s mode, details: %s' % (filename, method, str(details).decode("unicode-escape"))) + else: + # 失败 + LOG.fatal('Report failed, code %d' % resp.status_code) + + +g_script_path = script_path() +os.environ['KUDU_CONFIG'] = g_script_path +init_log() diff --git a/src/kudu/scripts/kudurc b/src/kudu/scripts/kudurc new file mode 100644 index 0000000..6b28831 --- /dev/null +++ b/src/kudu/scripts/kudurc @@ -0,0 +1,69 @@ +clusters_info: + c3prc-hadoop: + olap_version: 2 + region: chnbj-idc + charge_type: share + instance: SSD + special_db_org: + mifi: CL3894 + b2c: CL5281 + master_addresses: c3-hadoop-kudu-prc-ct01.bj:18600,c3-hadoop-kudu-prc-ct02.bj:18600,c3-hadoop-kudu-prc-ct03.bj:18600 + zjyprc-hadoop: + olap_version: 2 + region: chnbj-idc + charge_type: share + instance: SSD + master_addresses: zjy-hadoop-prc-ct01.bj:14000,zjy-hadoop-prc-ct02.bj:14000,zjy-hadoop-prc-ct03.bj:14000 + zjyprc-analysis: + 
olap_version: 2 + region: chnbj-idc + charge_type: exclusive + instance: SSD + special_db_org: + kudu_demo: CL18605 + ga_test: CL18605 + master_addresses: zjy-hadoop-prc-ct01.bj:15000,zjy-hadoop-prc-ct02.bj:15000,zjy-hadoop-prc-ct03.bj:15000 + azmbcommonprc-hadoop: + olap_version: 2 + region: indmb-aws + charge_type: share + instance: SSD + master_addresses: mb1-hadoop-kudu-prc-ct01.awsind:14000,mb2-hadoop-kudu-prc-ct02.awsind:14000,mb3-hadoop-kudu-prc-ct03.awsind:14000 + ksmosprc-xiaomi: + olap_version: 2 + region: rusmos-ks + charge_type: share + instance: SSD + master_addresses: mos1-hadoop-kudu-prc-ct01.ksru:14000,mos1-hadoop-kudu-prc-ct02.ksru:14000,mos1-hadoop-kudu-prc-ct03.ksru:14000 + alsgprc-xiaomi: + olap_version: 2 + region: sg-ali + charge_type: share + instance: SSD + special_db_org: + b2c: CL5281 + master_addresses: sgp1-hadoop-kudu-prc-ct01.alisgp:15000,sgp2-hadoop-kudu-prc-ct02.alisgp:15000,sgp2-hadoop-kudu-prc-ct03.alisgp:15000 + tjwqstaging-hdd: + olap_version: 2 + region: chnwq-ks + charge_type: public_share + instance: SSD + master_addresses: tj1-hadoop-kudu-tst-ct01.kscn:18600,tj1-hadoop-kudu-tst-ct02.kscn:18600,tj1-hadoop-kudu-tst-ct03.kscn:18600 + tjwqtst-dev: + olap_version: 2 + region: chnwq-ks + charge_type: public_share + instance: SSD + master_addresses: tj1-hadoop-kudu-tst-ct01.kscn:15000,tj1-hadoop-kudu-tst-ct02.kscn:15000,tj1-hadoop-kudu-tst-ct03.kscn:15000 + c3tst-test: + olap_version: 2 + region: chnbj-idc + charge_type: public_share + instance: HDD + master_addresses: c3-hadoop-kudu-prc-ct01.bj:15000,c3-hadoop-kudu-prc-ct02.bj:15000,c3-hadoop-kudu-prc-ct03.bj:15000 + c3tst-dev: + olap_version: 2 + region: chnbj-idc + charge_type: public_share + instance: HDD + master_addresses: c3-hadoop-kudu-prc-ct01.bj:18000,c3-hadoop-kudu-prc-ct02.bj:18000,c3-hadoop-kudu-prc-ct03.bj:18000 diff --git a/src/kudu/scripts/minos_control_server.py b/src/kudu/scripts/minos_control_server.py new file mode 100755 index 0000000..1d74878 --- /dev/null +++ 
b/src/kudu/scripts/minos_control_server.py @@ -0,0 +1,225 @@ +#! /usr/bin/env python +# coding=utf-8 + +# A tool for restarting servers, typically to restart tservers in kudu cluster + +import sys +import commands +import time +import json +import re +import os +import subprocess + +cluster = '' # cluster name in minos config +job = 'tablet_server' # job name in minos config +operate = 'stop' # minos operate type, currently support: restart, stop, rolling_update +tasks = range(0, 5) # an int element list, e.g. '[n]' for a single node, or 'range(m, n)' for several nodes +flags = '' # minos flags, e.g. '--update_config' for updating config +known_unhealth_nodes = set() +#known_unhealth_nodes.add() # it's ok to add some known unhealth nodes, e.g. some already stoped servers +default_follower_unavailable_considered_failed_sec = 300 # default value of follower_unavailable_considered_failed_sec +rebalance_cluster_after_operation = True # whether to rebalance cluster after operation + +def get_minos_type(cluster_name): + minos_type = 'null' + minos_clinet_path = None + + minos_config_file = os.getenv('MINOS_CONFIG_FILE') + minos_clinet_dir = os.getenv('MINOS_CLIENT_DIR') + if minos_config_file is not None and minos_clinet_dir is not None: + minos_config_dir = os.path.dirname(minos_config_file) + minos_config = '%s/xiaomi-config/conf/kudu/kudu-%s.cfg' % (minos_config_dir, cluster_name) + if os.path.exists(minos_config) and os.path.exists(minos_clinet_dir + '/deploy'): + return 'minos1.0', minos_clinet_dir + + minos2_config_file = os.getenv('MINOS2_CONFIG_FILE') + minos2_clinet_dir = os.getenv('MINOS2_CLIENT_DIR') + if minos2_config_file is not None and minos2_clinet_dir is not None: + minos2_config_dir = os.path.dirname(minos2_config_file) + minos2_config = '%s/xiaomi-config/conf/kudu/kudu-%s.yaml' % (minos2_config_dir, cluster_name) + if os.path.exists(minos2_config) and os.path.exists(minos2_clinet_dir + '/deploy'): + return 'minos2.0', minos2_clinet_dir + + return 
minos_type, minos_clinet_path + +def get_host(host_port): + return host_port.split(':')[0] + +def is_cluster_health(): + status, output = commands.getstatusoutput('${KUDU_HOME}/kudu cluster ksck @%s -consensus=false' + ' -ksck_format=json_compact -color=never' + ' -sections=MASTER_SUMMARIES,TSERVER_SUMMARIES,TABLE_SUMMARIES' + ' 2>/dev/null' + % cluster) + unhealth_nodes = set() + if status == 0 or status == 256: + ksck_info = json.loads(output) + for master in ksck_info['master_summaries']: + if master['health'] != 'HEALTHY': + unhealth_nodes.add(get_host(master['address'])) + for tserver in ksck_info['tserver_summaries']: + if tserver['health'] != 'HEALTHY': + unhealth_nodes.add(get_host(tserver['address'])) + if 'table_summaries' in ksck_info: + for table in ksck_info['table_summaries']: + if table['health'] != 'HEALTHY': + unhealth_nodes.add(table['name']) + else: + unhealth_nodes.add('mockone') + + return unhealth_nodes + + +def check_parameter(message, parameter, allow_empty = False): + print(message % parameter) + answer = sys.stdin.readline().strip('\n').lower() + if answer != 'y' and answer != '': + exit() + if (not allow_empty and + (not parameter or + (isinstance(parameter, list) and len(parameter) == 0) or + (isinstance(parameter, str) and parameter.strip() == ''))): + print(time_header() + 'You should provide a valid parameter') + exit() + + +def wait_cluster_health(): + print(time_header() + 'Wait cluster to be health ...') + nodes = is_cluster_health() + health = (len(nodes) == 0) + while not health: + health = True + for node in nodes: + if node not in known_unhealth_nodes: + health = False + print(time_header() + 'Unhealthy node: ' + node) + time.sleep(5) + nodes = is_cluster_health() + break + + +def parse_node_from_minos_output(output, job): + host = '' + regex = re.compile('[a-zA-Z\s]*[tT]ask [0-9]+ of (%s) on ([0-9a-z-.]+)\(0\).+' % job) + match = regex.search(output) + if match is not None: + host = match.group(2) + else: + print(time_header() 
+ 'Fail to parse node from minos output') + exit() + return host + + +def time_header(): + return time.strftime("%Y-%m-%d %H:%M:%S ", time.localtime()) + + +def get_tservers_info(): + tservers_info = dict() + status, output = commands.getstatusoutput('${KUDU_HOME}/kudu tserver list @%s -format=json' + % cluster) + if status == 0 or status == 256: + tservers_info = json.loads(output) + return tservers_info + + +def get_tablet_server_info(hostname, tservers_info): + rpc_address = '' + uuid = '' + for tserver in tservers_info: + if hostname in tserver['rpc-addresses']: + rpc_address = tserver['rpc-addresses'] + uuid = tserver['uuid'] + break + return rpc_address, uuid + + +def set_flag(rpc_address, seconds): + cmd = ('${KUDU_HOME}/kudu tserver set_flag %s follower_unavailable_considered_failed_sec %s' + % (rpc_address, seconds)) + status, output = commands.getstatusoutput(cmd) + + +def rebalance_cluster(blacklist_tserver_uuid): + ignored_tservers_uuid = set() + for node in known_unhealth_nodes: + rpc_address, uuid = get_tablet_server_info(node, tservers_info) + ignored_tservers_uuid.add(uuid) + cmd = ('${KUDU_HOME}/kudu cluster rebalance @%s -blacklist_tservers=%s -ignored_tservers=%s' + % (cluster, blacklist_tserver_uuid, str(','.join(ignored_tservers_uuid)))) + p = subprocess.Popen(cmd, stdout = subprocess.PIPE, shell=True) + for line in iter(p.stdout.readline, b''): + print line + p.stdout.close() + p.wait() + + +check_parameter('You will operate on cluster: %s? (y/n)', cluster) +minos_type, minos_client_path = get_minos_type(cluster) +if minos_type == 'null' or minos_client_path is None: + print("You should set these environment variables:\n* MINOS_CONFIG_FILE\n* MINOS_CLIENT_DIR\n" + + "* MINOS2_CONFIG_FILE\n* MINOS2_CLIENT_DIR\nand check cluster name") + exit() +check_parameter('The minos type is: %s? (y/n)', minos_type) +check_parameter('The minos client path is: %s? (y/n)', minos_client_path) +check_parameter('You will operate on job: %s? 
(y/n)', job) +check_parameter('You will operate on tasks: %s? (y/n)', tasks) +check_parameter('The operate is: %s? (y/n)', operate) +if operate == 'rolling_update' and flags.find('--update_package') == -1: + flags += ' --update_package' + if minos_type == 'minos2.0' and flags.find('--confirm_install') == -1: + flags += ' --confirm_install' +check_parameter('The extra flags are: %s? (y/n)', flags, True) +check_parameter('The known unhealth nodes are: %s? (y/n)', ','.join(known_unhealth_nodes), True) +check_parameter('The default value of follower_unavailable_considered_failed_sec is: %s? (y/n)', + default_follower_unavailable_considered_failed_sec, True) +check_parameter('You will rebalance cluster after operation: %s? (y/n)', rebalance_cluster_after_operation, True) + +tservers_info = get_tservers_info() +wait_cluster_health() + +if 'tablet_server' in job and operate in ['restart', 'rolling_update']: + for tserver in tservers_info: + set_flag(tserver['rpc-addresses'], 7200) + +for task in tasks: + if not isinstance(task, int): + print(time_header() + '%s is not a valid integer task id' % str(task)) + exit() + + if 'tablet_server' in job: + cmd = ('%s/deploy show kudu %s --job %s --task %d' + % (minos_client_path, cluster, job, task)) + status, output = commands.getstatusoutput(cmd) + print(output) + hostname = parse_node_from_minos_output(output, job) + rpc_address, uuid = get_tablet_server_info(hostname, tservers_info) + if operate == 'stop': + # migrate replicas on tserver + rebalance_cluster(uuid) + + print(time_header() + 'Start to operate on task %d' % task) + cmd = ('%s/deploy %s kudu %s --job %s --task %d --skip_confirm %s' + % (minos_client_path, operate, cluster, job, task, flags)) + status, output = commands.getstatusoutput(cmd) + print(output) + if operate == 'stop': + known_unhealth_nodes.add(parse_node_from_minos_output(output, job)) + + wait_cluster_health() + + if 'tablet_server' in job and operate in ['restart', 'rolling_update']: + 
set_flag(rpc_address, 7200)
+
+    print(time_header() + '==========================')
+    time.sleep(10)
+
+if 'tablet_server' in job and operate in ['restart', 'rolling_update']:
+    for tserver in tservers_info:
+        set_flag(tserver['rpc-addresses'], default_follower_unavailable_considered_failed_sec)
+
+if rebalance_cluster_after_operation:
+    rebalance_cluster('')
+
+print(time_header() + 'Complete successfully')
