Repository: incubator-hawq Updated Branches: refs/heads/master 127cac3e2 -> bee013732
HAWQ-324. Add minirepro management utility Project: http://git-wip-us.apache.org/repos/asf/incubator-hawq/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-hawq/commit/bee01373 Tree: http://git-wip-us.apache.org/repos/asf/incubator-hawq/tree/bee01373 Diff: http://git-wip-us.apache.org/repos/asf/incubator-hawq/diff/bee01373 Branch: refs/heads/master Commit: bee0137323c275a634ff7104a4c5640a4e6114a1 Parents: 127cac3 Author: Haisheng Yuan <[email protected]> Authored: Thu Jan 7 13:33:02 2016 -0800 Committer: Haisheng Yuan <[email protected]> Committed: Mon Jan 11 14:05:03 2016 -0800 ---------------------------------------------------------------------- src/backend/catalog/Makefile | 2 + src/backend/catalog/gp_toolkit.sql.in | 20 ++ src/bin/Makefile | 2 +- src/bin/gpoptutils/Makefile | 12 + src/bin/gpoptutils/gpoptutils.c | 140 ++++++++++++ tools/bin/minirepro | 343 +++++++++++++++++++++++++++++ 6 files changed, 518 insertions(+), 1 deletion(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/bee01373/src/backend/catalog/Makefile ---------------------------------------------------------------------- diff --git a/src/backend/catalog/Makefile b/src/backend/catalog/Makefile index e066cff..54d26cb 100644 --- a/src/backend/catalog/Makefile +++ b/src/backend/catalog/Makefile @@ -51,6 +51,7 @@ JETPACK_PREFIX = $(JETPACK_SCHEMA).$(JETPACK_OBJECT_PREFIX) JETPACK_PREFIX_AUX = $(JETPACK_SCHEMA).__$(JETPACK_OBJECT_PREFIX) WORKFILE_PATHNAME = $$libdir\/gp_workfile_mgr SESSION_STATE_PATHNAME = $$libdir\/gp_session_state +OPTUTILS_PATHNAME = $$libdir\/gpoptutils gp_toolkit.sql: gp_toolkit.sql.in sed \ @@ -60,6 +61,7 @@ gp_toolkit.sql: gp_toolkit.sql.in -e 's/%%JETPACK_PREFIX_AUX%%/$(JETPACK_PREFIX_AUX)/g' \ -e 's/%%WORKFILE_MODULE%%/$(WORKFILE_PATHNAME)/g' \ -e 's/%%SESSION_STATE_MODULE%%/$(SESSION_STATE_PATHNAME)/g' \ + -e 's/%%OPTUTILS_MODULE%%/$(OPTUTILS_PATHNAME)/g' \ -e 's/ / /g' \ $< > $@ http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/bee01373/src/backend/catalog/gp_toolkit.sql.in ---------------------------------------------------------------------- diff --git a/src/backend/catalog/gp_toolkit.sql.in b/src/backend/catalog/gp_toolkit.sql.in index 86269e5..ccd3212 100644 --- a/src/backend/catalog/gp_toolkit.sql.in +++ b/src/backend/catalog/gp_toolkit.sql.in @@ -2057,6 +2057,26 @@ LANGUAGE SQL; REVOKE ALL ON FUNCTION %%JETPACK_PREFIX%%workfile_cache_clear() FROM public; -------------------------------------------------------------------------------- +-- @function: +-- %%JETPACK_PREFIX%%dump_query_oids(text) +-- +-- @in: +-- text - SQL text +-- @out: +-- text - serialized json string of oids +-- +-- @doc: +-- Dump query oids for a given SQL text +-- +-------------------------------------------------------------------------------- + +CREATE FUNCTION %%JETPACK_PREFIX%%dump_query_oids(text) +RETURNS text +AS '%%OPTUTILS_MODULE%%', 'gp_dump_query_oids' LANGUAGE C STRICT; + +GRANT EXECUTE ON FUNCTION %%JETPACK_PREFIX%%dump_query_oids(text) TO public; + +-------------------------------------------------------------------------------- -- Finalize install COMMIT; http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/bee01373/src/bin/Makefile ---------------------------------------------------------------------- diff --git a/src/bin/Makefile b/src/bin/Makefile index 9d4919d..04d412d 100644 --- a/src/bin/Makefile +++ b/src/bin/Makefile @@ -16,7 +16,7 @@ include $(top_builddir)/src/Makefile.global DIRS = initdb ipcclean pg_ctl pg_dump pgbench \ psql scripts pg_config pg_controldata pg_resetxlog \ gpfilesystem/hdfs gpmirrortransition gpupgrade \ - gpfusion gp_workfile_mgr gpcheckhdfs gpfdist + gpfusion gp_workfile_mgr gpcheckhdfs gpfdist gpoptutils all install installdirs uninstall distprep clean distclean maintainer-clean: @for dir in $(DIRS); do echo "build $$dir"; $(MAKE) -C $$dir $@ || exit; done http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/bee01373/src/bin/gpoptutils/Makefile ---------------------------------------------------------------------- diff --git a/src/bin/gpoptutils/Makefile b/src/bin/gpoptutils/Makefile new file mode 100644 index 0000000..3cc8be8 --- /dev/null +++ b/src/bin/gpoptutils/Makefile @@ -0,0 +1,12 @@ +MODULE_big = gpoptutils +OBJS = gpoptutils.o + +ifdef USE_PGXS +PGXS := $(shell pg_config --pgxs) +include $(PGXS) +else +subdir = src/bin/gpoptutils +top_builddir = ../../.. +include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/bee01373/src/bin/gpoptutils/gpoptutils.c ---------------------------------------------------------------------- diff --git a/src/bin/gpoptutils/gpoptutils.c b/src/bin/gpoptutils/gpoptutils.c new file mode 100644 index 0000000..d1ae351 --- /dev/null +++ b/src/bin/gpoptutils/gpoptutils.c @@ -0,0 +1,140 @@ +/* + * Copyright (c) 2015 Pivotal Inc. All Rights Reserved + * + * --------------------------------------------------------------------- + * + * The dynamically linked library created from this source can be reference by + * creating a function in psql that references it. For example, + * + * CREATE FUNCTION gp_dump_query_oids(text) + * RETURNS text + * AS '$libdir/gpoptutils', 'gp_dump_query_oids' + * LANGUAGE C STRICT; + */ + +#include "postgres.h" +#include "funcapi.h" +#include "utils/builtins.h" +#include "gpopt/utils/nodeutils.h" +#include "rewrite/rewriteHandler.h" +#include "c.h" + +extern +List *pg_parse_and_rewrite(const char *query_string, Oid *paramTypes, int iNumParams); + +extern +List *QueryRewrite(Query *parsetree); + +static +Query *parseSQL(char *szSqlText); + +static +void traverseQueryRTEs(Query *pquery, HTAB *phtab, StringInfoData *buf); + +Datum gp_dump_query_oids(PG_FUNCTION_ARGS); + +#ifdef PG_MODULE_MAGIC +PG_MODULE_MAGIC; +#endif + +PG_FUNCTION_INFO_V1(gp_dump_query_oids); + +/* + * Parse a query given as SQL text. + */ +static Query *parseSQL(char *sqlText) +{ + Assert(sqlText); + + List *queryTree = pg_parse_and_rewrite(sqlText, NULL, 0); + + if (1 != list_length(queryTree)) + { + elog(ERROR, "Cannot parse query. " + "Please make sure the input contains a single valid query. \n%s", sqlText); + } + + Query *query = (Query *) lfirst(list_head(queryTree)); + + return query; +} + +static void traverseQueryRTEs + ( + Query *pquery, + HTAB *phtab, + StringInfoData *buf + ) +{ + ListCell *plc; + bool found; + foreach (plc, pquery->rtable) + { + RangeTblEntry *rte = (RangeTblEntry *) lfirst(plc); + + switch (rte->rtekind) + { + case RTE_RELATION: + { + hash_search(phtab, (void *)&rte->relid, HASH_ENTER, &found); + if (!found) + { + if (0 != buf->len) + appendStringInfo(buf, "%s", ", "); + appendStringInfo(buf, "%u", rte->relid); + } + } + break; + case RTE_SUBQUERY: + traverseQueryRTEs(rte->subquery, phtab, buf); + break; + default: + break; + } + } +} + +/* + * Function dumping dependent relation oids for a given SQL text + */ +Datum +gp_dump_query_oids(PG_FUNCTION_ARGS) +{ + char *szSqlText = text_to_cstring(PG_GETARG_TEXT_P(0)); + + Query *pquery = parseSQL(szSqlText); + if (CMD_UTILITY == pquery->commandType && T_ExplainStmt == pquery->utilityStmt->type) + { + Query *pqueryExplain = ((ExplainStmt *)pquery->utilityStmt)->query; + List *plQueryTree = QueryRewrite(pqueryExplain); + Assert(1 == list_length(plQueryTree)); + pquery = (Query *) lfirst(list_head(plQueryTree)); + } + + typedef struct OidHashEntry + { + Oid key; + bool value; + } OidHashEntry; + HASHCTL ctl; + ctl.keysize = sizeof(Oid); + ctl.entrysize = sizeof(OidHashEntry); + ctl.hash = oid_hash; + + StringInfoData buf; + initStringInfo(&buf); + + HTAB *phtab = hash_create("relid hash table", 100, &ctl, HASH_ELEM | HASH_FUNCTION); + traverseQueryRTEs(pquery, phtab, &buf); + hash_destroy(phtab); + + StringInfoData str; + initStringInfo(&str); + appendStringInfo(&str, "{\"relids\": [%s]}", buf.data); + + text *result = cstring_to_text(str.data); + pfree(buf.data); + pfree(str.data); + + PG_RETURN_TEXT_P(result); +} http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/bee01373/tools/bin/minirepro ---------------------------------------------------------------------- diff --git a/tools/bin/minirepro b/tools/bin/minirepro new file mode 100755 index 0000000..7238cd3 --- /dev/null +++ b/tools/bin/minirepro @@ -0,0 +1,343 @@ +#!/usr/bin/env python +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# +# Dump minimal schema objects and statistics for a given query +# +import os, sys, re, json, platform, subprocess +from optparse import OptionParser +from pygresql import pgdb +from datetime import datetime + +prog_version = '%prog 1.0' +PATH_PREFIX = '/tmp/' +sysnslist = "('pg_toast', 'pg_bitmapindex', 'pg_catalog', 'information_schema', 'hawq_toolkit', 'pg_aoseg')" +pgoptions = '-c gp_session_role=utility' + +class MRQuery(object): + def __init__(self): + self.relations = {} + self.funcs = {} + self.relids = [] + +def E(query_str): + return pgdb.escape_string(query_str) + +def generate_timestamp(): + timestamp = datetime.now() + return timestamp.strftime("%Y%m%d%H%M%S") + +def result_iter(cursor, arraysize=1000): + 'An iterator that uses fetchmany to keep memory usage down' + while True: + results = cursor.fetchmany(arraysize) + if not results: + break + for result in results: + yield result + +def get_version(cursor): + query = "select version()" + try: + cursor.execute(query) + vals = cursor.fetchone() + return vals[0] + except pgdb.DatabaseError as e: + sys.stderr.write('\nError while trying to find HAWQ/GPDB version.\n\n' + str(e) + '\n\n') + sys.exit(1) + +def parse_cmd_line(): + p = OptionParser(usage='Usage: %prog <database> [options]', version=prog_version, conflict_handler="resolve") + p.add_option('-?', '--help', action='help', help='Show this help message and exit') + p.add_option('-h', '--host', action='store', + dest='host', help='Specify a remote host') + p.add_option('-p', '--port', action='store', + dest='port', help='Specify a port other than 5432') + p.add_option('-U', '--user', action='store', dest='user', + help='Connect as someone other than current user') + p.add_option('-q', action='store', dest='query_file', + help='file name that contains the query') + p.add_option('-f', action='store', dest='output_file', + help='minirepro output file name') + return p + +def dump_query(cursor, query_file): + print "Extracting metadata from query file %s ..." % query_file + + with open(query_file, 'r') as query_f: + sql_text = query_f.read() + query = "select hawq_toolkit.hawq_dump_query_oids('%s')" % E(sql_text) + + try: + cursor.execute(query) + vals = cursor.fetchone() + return vals[0] + except pgdb.DatabaseError as e: + sys.stderr.writelines('\nError while running hawq_toolkit.hawq_dump_query_oids(text).\nPlease make sure ' \ + 'the function is installed and the query file contains single valid query.\n\n' + str(e) + '\n\n') + sys.exit(1) + +# relation oid will be extracted from the dump string +def parse_oids(cursor, json_oids): + result = MRQuery() + oids = json.loads(json_oids)['relids'] + result.relids = [str(x) for x in oids] + + if len(result.relids) == 0: + result.relids.append('0') + oid_str = ','.join(result.relids) + cat_query = "SELECT nspname, relname FROM pg_class, pg_namespace WHERE pg_class.relnamespace = pg_namespace.oid " \ + "AND pg_class.oid IN (%s)" % oid_str + + cursor.execute(cat_query) + + for vals in result_iter(cursor): + schm, relname = vals[0], vals[1] + if schm not in result.relations: + result.relations[schm] = [relname] + else: + result.relations[schm].append(relname) + + return result + +def pg_dump_object(obj_dict, connectionInfo, envOpts): + for schema, table_list in obj_dict.iteritems(): + out_file = PATH_PREFIX + schema + '.dp.sql' + dmp_cmd = 'pg_dump -h %s -p %s -U %s -sxO %s' % connectionInfo + dmp_cmd = "%s -t '%s.%s' -f %s" % (dmp_cmd, E(schema), E('|'.join(table_list)), E(out_file)) + print dmp_cmd + p = subprocess.Popen(dmp_cmd, shell=True, stderr=subprocess.PIPE, env=envOpts) + if p.wait() is not 0: + sys.stderr.write('\nError while dumping schema.\n\n' + p.communicate()[1] + '\n\n') + sys.exit(1) + +def print_obj_ddl(filename, f_out): + if filename.endswith('.dp.sql'): + f_path = os.path.join(PATH_PREFIX, filename) + with open(f_path, 'r') as f_opened: + line_no = 1 + for line in f_opened: + if line_no == 12 or line_no > 16: + f_out.writelines(line) + line_no += 1 + +def dump_tuple_count(cur, oid_str, f_out): + stmt = "SELECT pgc.relname, pgn.nspname, pgc.relpages, pgc.reltuples FROM pg_class pgc, pg_namespace pgn " \ + "WHERE pgc.relnamespace = pgn.oid and pgc.oid in (%s) and pgn.nspname NOT LIKE 'pg_temp_%%' " \ + "and pgn.nspname NOT IN %s" % (oid_str, sysnslist) + + templateStmt = '-- Table: {1}\n' \ + 'UPDATE pg_class\nSET\n' \ + '{0}\n' \ + 'WHERE relname = \'{1}\' AND relnamespace = ' \ + '(SELECT oid FROM pg_namespace WHERE nspname = \'{2}\');\n\n' + + cur.execute(stmt) + columns = [x[0] for x in cur.description] + types = ['int', 'real'] + for vals in result_iter(cur): + lines = [] + for col, val, typ in zip(columns[2:], vals[2:], types): + # i.e. relpages = 1::int, reltuples = 1.0::real + lines.append('\t%s = %s::%s' % (col, val, typ)) + updateStmt = templateStmt.format(E(',\n'.join(lines)), E(vals[0]), E(vals[1])) + f_out.writelines(updateStmt) + +def dump_stats(cur, oid_str, f_out): + query = 'SELECT pgc.relname, pgn.nspname, pga.attname, pgt.typname, pgs.* ' \ + 'FROM pg_class pgc, pg_statistic pgs, pg_namespace pgn, pg_attribute pga, pg_type pgt ' \ + 'WHERE pgc.relnamespace = pgn.oid and pgc.oid in (%s) and pgn.nspname NOT IN %s ' \ + 'and pgn.nspname NOT LIKE \'pg_temp_%%\' ' \ + 'and pgc.oid = pgs.starelid ' \ + 'and pga.attrelid = pgc.oid ' \ + 'and pga.attnum = pgs.staattnum ' \ + 'and pga.atttypid = pgt.oid ' \ + 'ORDER BY pgc.relname, pgs.staattnum' % (oid_str, sysnslist) + + pstring = '--\n' \ + '-- Table: {0}, Attribute: {1}\n' \ + '--\n' \ + 'INSERT INTO pg_statistic VALUES (\n' \ + '{2});\n\n' + types = ['smallint', # staattnum + 'real', + 'integer', + 'real', + 'smallint', + 'smallint', + 'smallint', + 'smallint', + 'oid', + 'oid', + 'oid', + 'oid', + 'real[]', + 'real[]', + 'real[]', + 'real[]' + ] + + cur.execute(query) + + for vals in result_iter(cur): + rowVals = ["\t'%s.%s'::regclass" % (E(vals[1]), E(vals[0]))] + + if vals[3][0] == '_': + rowTypes = types + [vals[3]] * 4 + else: + rowTypes = types + [vals[3] + '[]'] * 4 + for val, typ in zip(vals[5:], rowTypes): + if val is None: + val = 'NULL' + elif isinstance(val, (str, unicode)) and val[0] == '{': + val = "E'%s'" % val + rowVals.append('\t{0}::{1}'.format(val, typ)) + f_out.writelines(pstring.format(E(vals[0]), E(vals[2]), ',\n'.join(rowVals))) + +def main(): + parser = parse_cmd_line() + options, args = parser.parse_args() + if len(args) != 1: + parser.error("No database specified") + exit(1) + + # OK - now let's setup all the arguments & options + envOpts = os.environ + db = args[0] + host = options.host or platform.node() + user = options.user or os.getlogin() + port = options.port or envOpts['PGPORT'] or '5432' + query_file = options.query_file + output_file = options.output_file + + if query_file is None: + parser.error("No query file specified.") + exit(1) + if output_file is None: + parser.error("No output file specified.") + exit(1) + if not os.path.isfile(query_file): + parser.error('Query file %s does not exist.' % query_file) + exit(1) + + timestamp = generate_timestamp() + global PATH_PREFIX + PATH_PREFIX = PATH_PREFIX + timestamp + '/' + + # Create tmp dir if not already there + try: + os.stat(PATH_PREFIX) + except: + os.mkdir(PATH_PREFIX) + + # setup the connection info tuple with options + connectionInfo = (host, port, user, db) + connectionString = ':'.join([host, port, db, user, '', pgoptions, '']) + print "Connecting to database: host=%s, port=%s, user=%s, db=%s ..." % connectionInfo + conn = pgdb.connect(connectionString) + cursor = conn.cursor() + + # get server version, which is dumped to minirepro output file + version = get_version(cursor) + + """ + invoke hawq_toolkit UDF, dump relation oids as json text + input: query file name + output: json oids string + """ + json_str = dump_query(cursor, query_file) + + """ + parse json oids string, collect all things that need to be dumped + input: json oids string + output: MRQuery class (self.relations, self.funcs, self.relids) + """ + mr_query = parse_oids(cursor, json_str) + + # dump tables and views + print "Invoking pg_dump to dump DDL ..." + pg_dump_object(mr_query.relations, connectionInfo, envOpts) + + # dump functions + # TODO #108977046 + + ### start writing out to stdout ### + output_dir = os.path.dirname(output_file) + if not os.path.exists(output_dir): + os.makedirs(output_dir) + f_out = open(output_file, 'w') + ts = datetime.today() + f_out.writelines(['-- MiniRepro 1.0', + '\n-- Copyright 2016, The Apache Software Foundation' + '\n-- Database: ' + db, + '\n-- Date: ' + ts.date().isoformat(), + '\n-- Time: ' + ts.time().isoformat(), + '\n-- CmdLine: ' + ' '.join(sys.argv), + '\n-- Version: ' + version + '\n\n']) + + # Now be sure that when we load the rest we are doing it in the right + # database + f_out.writelines('\\connect ' + db + '\n\n') + + # first create schema DDLs + print "Writing schema DDLs ..." + table_schemas = ["CREATE SCHEMA %s;\n" % E(schema) for schema in mr_query.relations if schema != 'public'] + f_out.writelines(table_schemas) + + # write table & view DDLs + print "Writing table & view DDLs ..." + for f in os.listdir(PATH_PREFIX): + print_obj_ddl(f, f_out) + + # Now we have to explicitly allow editing of these pg_class & + # pg_statistic tables + f_out.writelines(['\n-- ', + '\n-- Allow system table modifications', + '\n-- ', + '\nset allow_system_table_mods="DML";\n\n']) + + + # dump table stats + print "Writing table statistics ..." + oid_str = ','.join(mr_query.relids) + dump_tuple_count(cursor, oid_str, f_out) + + # dump column stats + print "Writing column statistics ..." + dump_stats(cursor, oid_str, f_out) + + cursor.close() + conn.close() + + # attach query text + print "Attaching raw query text ..." + f_out.writelines(['\n-- ', + '\n-- Query text', + '\n-- \n\n']) + + with open(query_file, 'r') as query_f: + for line in query_f: + f_out.writelines('-- ' + line) + + f_out.writelines('\n-- MiniRepro completed.\n') + f_out.close() + + print "--- MiniRepro completed! ---" + +if __name__ == "__main__": + main()
