HAWQ-778. Refine hawq register, add sanity checks.
Project: http://git-wip-us.apache.org/repos/asf/incubator-hawq/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-hawq/commit/3e3f93dd Tree: http://git-wip-us.apache.org/repos/asf/incubator-hawq/tree/3e3f93dd Diff: http://git-wip-us.apache.org/repos/asf/incubator-hawq/diff/3e3f93dd Branch: refs/heads/master Commit: 3e3f93dd1c0ddb92d4971ba0c88264c739973c31 Parents: 355c437 Author: Yancheng Luo <[email protected]> Authored: Fri Jun 3 14:48:33 2016 +0800 Committer: Lili Ma <[email protected]> Committed: Tue Jun 7 11:03:56 2016 +0800 ---------------------------------------------------------------------- src/test/feature/ManagementTool/test_hawq.paq | Bin 657 -> 0 bytes .../ManagementTool/test_hawq_register.cpp | 78 ++++++---- .../ManagementTool/test_hawq_register_hawq.paq | Bin 0 -> 657 bytes .../ManagementTool/test_hawq_register_hive.paq | Bin 0 -> 212 bytes .../ManagementTool/test_hawq_register_not_paq | Bin 0 -> 48 bytes src/test/feature/ManagementTool/test_hive.paq | Bin 212 -> 0 bytes src/test/feature/ManagementTool/test_not_paq | Bin 48 -> 0 bytes tools/bin/hawqpylib/HAWQ_HELP.py | 1 + tools/bin/hawqregister | 144 ++++++++++++------- tools/doc/hawqregister_help | 15 +- 10 files changed, 148 insertions(+), 90 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/3e3f93dd/src/test/feature/ManagementTool/test_hawq.paq ---------------------------------------------------------------------- diff --git a/src/test/feature/ManagementTool/test_hawq.paq b/src/test/feature/ManagementTool/test_hawq.paq deleted file mode 100644 index f2adb4b..0000000 Binary files a/src/test/feature/ManagementTool/test_hawq.paq and /dev/null differ http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/3e3f93dd/src/test/feature/ManagementTool/test_hawq_register.cpp ---------------------------------------------------------------------- diff --git 
a/src/test/feature/ManagementTool/test_hawq_register.cpp b/src/test/feature/ManagementTool/test_hawq_register.cpp index 328c19d..2efbd84 100644 --- a/src/test/feature/ManagementTool/test_hawq_register.cpp +++ b/src/test/feature/ManagementTool/test_hawq_register.cpp @@ -19,15 +19,15 @@ class TestHawqRegister : public ::testing::Test { TEST_F(TestHawqRegister, TestSingleHawqFile) { SQLUtility util; string rootPath(util.getTestRootPath()); - string relativePath("/testhawqregister/test_hawq.paq"); + string relativePath("/ManagementTool/test_hawq_register_hawq.paq"); string filePath = rootPath + relativePath; - EXPECT_EQ(0, Command::getCommandStatus("hadoop fs -put " + filePath + " /hawq_register_hawq.paq")); + EXPECT_EQ(0, Command::getCommandStatus("hadoop fs -put " + filePath + " hdfs://localhost:8020/hawq_register_hawq.paq")); util.execute("create table hawqregister(i int) with (appendonly=true, orientation=parquet);"); util.query("select * from hawqregister;", 0); - EXPECT_EQ(0, Command::getCommandStatus("hawq register postgres hawqregister /hawq_register_hawq.paq")); + EXPECT_EQ(0, Command::getCommandStatus("hawq register postgres hawqregister hdfs://localhost:8020/hawq_register_hawq.paq")); util.query("select * from hawqregister;", 3); util.execute("insert into hawqregister values(1);"); @@ -38,15 +38,15 @@ TEST_F(TestHawqRegister, TestSingleHawqFile) { TEST_F(TestHawqRegister, TestSingleHiveFile) { SQLUtility util; string rootPath(util.getTestRootPath()); - string relativePath("/testhawqregister/test_hive.paq"); + string relativePath("/ManagementTool/test_hawq_register_hive.paq"); string filePath = rootPath + relativePath; - EXPECT_EQ(0, Command::getCommandStatus("hadoop fs -put " + filePath + " /hawq_register_hive.paq")); + EXPECT_EQ(0, Command::getCommandStatus("hadoop fs -put " + filePath + " hdfs://localhost:8020/hawq_register_hive.paq")); util.execute("create table hawqregister(i int) with (appendonly=true, orientation=parquet);"); util.query("select * from 
hawqregister;", 0); - EXPECT_EQ(0, Command::getCommandStatus("hawq register postgres hawqregister /hawq_register_hive.paq")); + EXPECT_EQ(0, Command::getCommandStatus("hawq register postgres hawqregister hdfs://localhost:8020/hawq_register_hive.paq")); util.query("select * from hawqregister;", 1); util.execute("insert into hawqregister values(1);"); @@ -57,83 +57,83 @@ TEST_F(TestHawqRegister, TestSingleHiveFile) { TEST_F(TestHawqRegister, TestFiles) { SQLUtility util; string rootPath(util.getTestRootPath()); - string relativePath("/testhawqregister/test_hawq.paq"); + string relativePath("/ManagementTool/test_hawq_register_hawq.paq"); string filePath1 = rootPath + relativePath; - relativePath = "/testhawqregister/test_hive.paq"; + relativePath = "/ManagementTool/test_hawq_register_hive.paq"; string filePath2 = rootPath + relativePath; - EXPECT_EQ(0, Command::getCommandStatus("hadoop fs -mkdir -p /hawq_register_test/t/t")); - EXPECT_EQ(0, Command::getCommandStatus("hadoop fs -put " + filePath1 + " /hawq_register_test/hawq1.paq")); - EXPECT_EQ(0, Command::getCommandStatus("hadoop fs -put " + filePath1 + " /hawq_register_test/hawq2.paq")); - EXPECT_EQ(0, Command::getCommandStatus("hadoop fs -put " + filePath1 + " /hawq_register_test/t/hawq.paq")); + EXPECT_EQ(0, Command::getCommandStatus("hadoop fs -mkdir -p hdfs://localhost:8020/hawq_register_test/t/t")); + EXPECT_EQ(0, Command::getCommandStatus("hadoop fs -put " + filePath1 + " hdfs://localhost:8020/hawq_register_test/hawq1.paq")); + EXPECT_EQ(0, Command::getCommandStatus("hadoop fs -put " + filePath1 + " hdfs://localhost:8020/hawq_register_test/hawq2.paq")); + EXPECT_EQ(0, Command::getCommandStatus("hadoop fs -put " + filePath1 + " hdfs://localhost:8020/hawq_register_test/t/hawq.paq")); - EXPECT_EQ(0, Command::getCommandStatus("hadoop fs -put " + filePath2 + " /hawq_register_test/hive1.paq")); - EXPECT_EQ(0, Command::getCommandStatus("hadoop fs -put " + filePath2 + " /hawq_register_test/hive2.paq")); - EXPECT_EQ(0, 
Command::getCommandStatus("hadoop fs -put " + filePath2 + " /hawq_register_test/t/hive.paq")); + EXPECT_EQ(0, Command::getCommandStatus("hadoop fs -put " + filePath2 + " hdfs://localhost:8020/hawq_register_test/hive1.paq")); + EXPECT_EQ(0, Command::getCommandStatus("hadoop fs -put " + filePath2 + " hdfs://localhost:8020/hawq_register_test/hive2.paq")); + EXPECT_EQ(0, Command::getCommandStatus("hadoop fs -put " + filePath2 + " hdfs://localhost:8020/hawq_register_test/t/hive.paq")); util.execute("create table hawqregister(i int) with (appendonly=true, orientation=parquet);"); util.query("select * from hawqregister;", 0); - EXPECT_EQ(0, Command::getCommandStatus("hawq register postgres hawqregister /hawq_register_test")); + EXPECT_EQ(0, Command::getCommandStatus("hawq register postgres hawqregister hdfs://localhost:8020/hawq_register_test")); util.query("select * from hawqregister;", 12); util.execute("insert into hawqregister values(1);"); util.query("select * from hawqregister;", 13); - EXPECT_EQ(0, Command::getCommandStatus("hadoop fs -rm -r /hawq_register_test")); + EXPECT_EQ(0, Command::getCommandStatus("hadoop fs -rm -r hdfs://localhost:8020/hawq_register_test")); util.execute("drop table hawqregister;"); } TEST_F(TestHawqRegister, TestHashDistributedTable) { SQLUtility util; string rootPath(util.getTestRootPath()); - string relativePath("/testhawqregister/test_hawq.paq"); + string relativePath("/ManagementTool/test_hawq_register_hawq.paq"); string filePath = rootPath + relativePath; - EXPECT_EQ(0, Command::getCommandStatus("hadoop fs -put " + filePath + " /hawq_register_hawq.paq")); + EXPECT_EQ(0, Command::getCommandStatus("hadoop fs -put " + filePath + " hdfs://localhost:8020/hawq_register_hawq.paq")); util.execute("create table hawqregister(i int) with (appendonly=true, orientation=parquet) distributed by (i);"); util.query("select * from hawqregister;", 0); - EXPECT_EQ(1, Command::getCommandStatus("hawq register postgres hawqregister 
/hawq_register_hawq.paq")); + EXPECT_EQ(1, Command::getCommandStatus("hawq register postgres hawqregister hdfs://localhost:8020/hawq_register_hawq.paq")); util.query("select * from hawqregister;", 0); - EXPECT_EQ(0, Command::getCommandStatus("hadoop fs -rm /hawq_register_hawq.paq")); + EXPECT_EQ(0, Command::getCommandStatus("hadoop fs -rm hdfs://localhost:8020/hawq_register_hawq.paq")); util.execute("drop table hawqregister;"); } TEST_F(TestHawqRegister, TestNotParquetFile) { SQLUtility util; string rootPath(util.getTestRootPath()); - string relativePath("/testhawqregister/test_not_paq"); + string relativePath("/ManagementTool/test_hawq_register_not_paq"); string filePath = rootPath + relativePath; - EXPECT_EQ(0, Command::getCommandStatus("hadoop fs -put " + filePath + " /hawq_register_test_not_paq")); + EXPECT_EQ(0, Command::getCommandStatus("hadoop fs -put " + filePath + " hdfs://localhost:8020/hawq_register_test_not_paq")); util.execute("create table hawqregister(i int) with (appendonly=true, orientation=parquet);"); util.query("select * from hawqregister;", 0); - EXPECT_EQ(1, Command::getCommandStatus("hawq register postgres hawqregister /hawq_register_test_not_paq")); + EXPECT_EQ(1, Command::getCommandStatus("hawq register postgres hawqregister hdfs://localhost:8020/hawq_register_test_not_paq")); util.query("select * from hawqregister;", 0); - EXPECT_EQ(0, Command::getCommandStatus("hadoop fs -rm /hawq_register_test_not_paq")); + EXPECT_EQ(0, Command::getCommandStatus("hadoop fs -rm hdfs://localhost:8020/hawq_register_test_not_paq")); util.execute("drop table hawqregister;"); } TEST_F(TestHawqRegister, TestNotParquetTable) { SQLUtility util; string rootPath(util.getTestRootPath()); - string relativePath("/testhawqregister/test_hawq.paq"); + string relativePath("/ManagementTool/test_hawq_register_hawq.paq"); string filePath = rootPath + relativePath; - EXPECT_EQ(0, Command::getCommandStatus("hadoop fs -put " + filePath + " /hawq_register_hawq.paq")); + 
EXPECT_EQ(0, Command::getCommandStatus("hadoop fs -put " + filePath + " hdfs://localhost:8020/hawq_register_hawq.paq")); util.execute("create table hawqregister(i int);"); util.query("select * from hawqregister;", 0); - EXPECT_EQ(1, Command::getCommandStatus("hawq register postgres hawqregister /hawq_register_hawq.paq")); + EXPECT_EQ(1, Command::getCommandStatus("hawq register postgres hawqregister hdfs://localhost:8020/hawq_register_hawq.paq")); util.query("select * from hawqregister;", 0); - EXPECT_EQ(0, Command::getCommandStatus("hadoop fs -rm /hawq_register_hawq.paq")); + EXPECT_EQ(0, Command::getCommandStatus("hadoop fs -rm hdfs://localhost:8020/hawq_register_hawq.paq")); util.execute("drop table hawqregister;"); } @@ -143,8 +143,26 @@ TEST_F(TestHawqRegister, TestFileNotExist) { util.execute("create table hawqregister(i int);"); util.query("select * from hawqregister;", 0); - EXPECT_EQ(1, Command::getCommandStatus("hawq register postgres hawqregister /hawq_register_file_not_exist")); + EXPECT_EQ(1, Command::getCommandStatus("hawq register postgres hawqregister /hdfs://localhost:8020hawq_register_file_not_exist")); + util.query("select * from hawqregister;", 0); + + util.execute("drop table hawqregister;"); +} + +TEST_F(TestHawqRegister, TestNotHDFSPath) { + SQLUtility util; + string rootPath(util.getTestRootPath()); + string relativePath("/ManagementTool/test_hawq_register_hawq.paq"); + string filePath = rootPath + relativePath; + + EXPECT_EQ(0, Command::getCommandStatus("hadoop fs -put " + filePath + " hdfs://localhost:8020/hawq_register_hawq.paq")); + + util.execute("create table hawqregister(i int);"); + util.query("select * from hawqregister;", 0); + + EXPECT_EQ(1, Command::getCommandStatus("hawq register postgres hawqregister /hawq_register_hawq.paq")); util.query("select * from hawqregister;", 0); + EXPECT_EQ(0, Command::getCommandStatus("hadoop fs -rm hdfs://localhost:8020/hawq_register_hawq.paq")); util.execute("drop table hawqregister;"); } 
http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/3e3f93dd/src/test/feature/ManagementTool/test_hawq_register_hawq.paq ---------------------------------------------------------------------- diff --git a/src/test/feature/ManagementTool/test_hawq_register_hawq.paq b/src/test/feature/ManagementTool/test_hawq_register_hawq.paq new file mode 100644 index 0000000..f2adb4b Binary files /dev/null and b/src/test/feature/ManagementTool/test_hawq_register_hawq.paq differ http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/3e3f93dd/src/test/feature/ManagementTool/test_hawq_register_hive.paq ---------------------------------------------------------------------- diff --git a/src/test/feature/ManagementTool/test_hawq_register_hive.paq b/src/test/feature/ManagementTool/test_hawq_register_hive.paq new file mode 100644 index 0000000..a356fc7 Binary files /dev/null and b/src/test/feature/ManagementTool/test_hawq_register_hive.paq differ http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/3e3f93dd/src/test/feature/ManagementTool/test_hawq_register_not_paq ---------------------------------------------------------------------- diff --git a/src/test/feature/ManagementTool/test_hawq_register_not_paq b/src/test/feature/ManagementTool/test_hawq_register_not_paq new file mode 100644 index 0000000..dc75c44 Binary files /dev/null and b/src/test/feature/ManagementTool/test_hawq_register_not_paq differ http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/3e3f93dd/src/test/feature/ManagementTool/test_hive.paq ---------------------------------------------------------------------- diff --git a/src/test/feature/ManagementTool/test_hive.paq b/src/test/feature/ManagementTool/test_hive.paq deleted file mode 100644 index a356fc7..0000000 Binary files a/src/test/feature/ManagementTool/test_hive.paq and /dev/null differ http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/3e3f93dd/src/test/feature/ManagementTool/test_not_paq 
---------------------------------------------------------------------- diff --git a/src/test/feature/ManagementTool/test_not_paq b/src/test/feature/ManagementTool/test_not_paq deleted file mode 100644 index dc75c44..0000000 Binary files a/src/test/feature/ManagementTool/test_not_paq and /dev/null differ http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/3e3f93dd/tools/bin/hawqpylib/HAWQ_HELP.py ---------------------------------------------------------------------- diff --git a/tools/bin/hawqpylib/HAWQ_HELP.py b/tools/bin/hawqpylib/HAWQ_HELP.py index e0d901f..b3a2dee 100755 --- a/tools/bin/hawqpylib/HAWQ_HELP.py +++ b/tools/bin/hawqpylib/HAWQ_HELP.py @@ -37,6 +37,7 @@ The most commonly used hawq "commands" are: ssh-exkeys Exchanges SSH public keys between hosts. check Verifies and validates HAWQ settings. checkperf Verifies the baseline hardware performance of hosts. + register Register parquet files generated by other systems into the corresponding table in HAWQ See 'hawq <command> help' for more information on a specific command.
""" http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/3e3f93dd/tools/bin/hawqregister ---------------------------------------------------------------------- diff --git a/tools/bin/hawqregister b/tools/bin/hawqregister index 876aed7..8e0e621 100755 --- a/tools/bin/hawqregister +++ b/tools/bin/hawqregister @@ -39,6 +39,7 @@ logger = get_default_logger() EXECNAME = os.path.split(__file__)[-1] setup_tool_logging(EXECNAME,getLocalHostname(),getUserName()) + def create_opt_parser(version): parser = OptParser(option_class=OptChecker, usage='usage: %prog [options] database_name table_name file_or_dir_path_in_hdfs', @@ -51,11 +52,20 @@ def create_opt_parser(version): return parser +def check_hadoop_command(): + hdfscmd = "hadoop" + result = local_ssh(hdfscmd); + if result != 0: + logger.error("command 'hadoop' is not available, please set environment variable $PATH to fix this") + sys.exit(1) + + def get_seg_name(options, databasename, tablename): try: relfilenode = 0 relname = "" - query = "select pg_class2.relname from pg_class as pg_class1, pg_appendonly, pg_class as pg_class2 where pg_class1.relname ='%s' and pg_class1.oid = pg_appendonly.relid and pg_appendonly.segrelid = pg_class2.oid;" % tablename + query = ("select pg_class2.relname from pg_class as pg_class1, pg_appendonly, pg_class as pg_class2 where pg_class1.relname ='%s' " + "and pg_class1.oid = pg_appendonly.relid and pg_appendonly.segrelid = pg_class2.oid;") % tablename dburl = dbconn.DbURL(hostname=options.host, port=options.port, username=options.user, dbname=databasename) conn = dbconn.connect(dburl, True) rows = dbconn.execSQL(conn, query) @@ -103,6 +113,61 @@ def check_hash_type(options, databasename, tablename): sys.exit(1) +def get_metadata_from_database(options, databasename, tablename, seg_name): + try: + query = "select segno from pg_aoseg.%s;" % seg_name + dburl = dbconn.DbURL(hostname=options.host, port=options.port, username=options.user, dbname=databasename) + conn = 
dbconn.connect(dburl, False) + rows = dbconn.execSQL(conn, query) + conn.commit() + conn.close() + + except DatabaseError, ex: + logger.error("Failed to connect to database, this script can only be run when the database is up") + logger.error("host = %s, port = %d, user = %s, dbname = %s, query = %s" % (options.host, options.port, options.user, databasename, query)) + sys.exit(1) + + firstsegno = rows.rowcount + 1 + + # get the full path of correspoding file for target table + try: + query = ("select location, gp_persistent_tablespace_node.tablespace_oid, database_oid, relfilenode from pg_class, gp_persistent_relation_node, " + "gp_persistent_tablespace_node, gp_persistent_filespace_node where relname = '%s' and pg_class.relfilenode = " + "gp_persistent_relation_node.relfilenode_oid and gp_persistent_relation_node.tablespace_oid = gp_persistent_tablespace_node.tablespace_oid " + "and gp_persistent_filespace_node.filespace_oid = gp_persistent_filespace_node.filespace_oid;") % tablename + dburl = dbconn.DbURL(hostname=options.host, port=options.port, username=options.user, dbname=databasename) + conn = dbconn.connect(dburl, False) + rows = dbconn.execSQL(conn, query) + conn.commit() + conn.close() + + except DatabaseError, ex: + logger.error("Failed to connect to database, this script can only be run when the database is up") + logger.error("host = %s, port = %d, user = %s, dbname = %s, query = %s" % (options.host, options.port, options.user, databasename, query)) + sys.exit(1) + + for row in rows: + tabledir = row[0].strip() + "/" + str(row[1]) + "/" + str(row[2]) + "/" + str(row[3]) + "/" + + return firstsegno, tabledir + + +def check_files_and_table_in_same_hdfs_cluster(filepath, tabledir): + # check whether the files to be registered is in hdfs + filesystem = filepath.split('://') + if filesystem[0] != 'hdfs': + logger.error("Only support to register file(s) in hdfs") + sys.exit(1) + fileroot = filepath.split('/') + tableroot = tabledir.split('/') + print 
fileroot + print tableroot + # check the root url of them. eg: for 'hdfs://localhost:8020/temp/tempfile', we check 'hdfs://localohst:8020' + if fileroot[0] != tableroot[0] or fileroot[1] != tableroot[1] or fileroot[2] != tableroot[2]: + logger.error("Files to be registered and the table are not in the same hdfs cluster.") + sys.exit(1) + + def get_files_in_hdfs(filepath): files = [] sizes = [] @@ -143,48 +208,13 @@ def check_parquet_format(options, files): sys.exit(1) -def get_metadata_from_database(options, databasename, seg_name): - try: - query = "select segno from pg_aoseg.%s;" % seg_name - dburl = dbconn.DbURL(hostname=options.host, port=options.port, username=options.user, dbname=databasename) - conn = dbconn.connect(dburl, False) - rows = dbconn.execSQL(conn, query) - conn.commit() - conn.close() - - except DatabaseError, ex: - logger.error("Failed to connect to database, this script can only be run when the database is up") - logger.error("host = %s, port = %d, user = %s, dbname = %s, query = %s" % (options.host, options.port, options.user, databasename, query)) - sys.exit(1) - - return rows.rowcount + 1 - - -def move_files_in_hdfs(options, databasename, tablename, files, firstsegno, normal): - # get the full path of correspoding file for target table - try: - query = "select location, gp_persistent_tablespace_node.tablespace_oid, database_oid, relfilenode from pg_class, gp_persistent_relation_node, gp_persistent_tablespace_node, gp_persistent_filespace_node where relname = '%s' and pg_class.relfilenode = gp_persistent_relation_node.relfilenode_oid and gp_persistent_relation_node.tablespace_oid = gp_persistent_tablespace_node.tablespace_oid and gp_persistent_filespace_node.filespace_oid = gp_persistent_filespace_node.filespace_oid;" % tablename - dburl = dbconn.DbURL(hostname=options.host, port=options.port, username=options.user, dbname=databasename) - conn = dbconn.connect(dburl, False) - rows = dbconn.execSQL(conn, query) - conn.commit() - conn.close() - 
- except DatabaseError, ex: - logger.error("Failed to connect to database, this script can only be run when the database is up") - logger.error("host = %s, port = %d, user = %s, dbname = %s, query = %s" % (options.host, options.port, options.user, databasename, query)) - sys.exit(1) - - +def move_files_in_hdfs(options, databasename, tablename, files, firstsegno, tabledir, normal): # move file(s) in src path into the folder correspoding to the target table if (normal == True): - for row in rows: - destdir = row[0].strip() + "/" + str(row[1]) + "/" + str(row[2]) + "/" + str(row[3]) + "/" - segno = firstsegno for file in files: srcfile = file - dstfile = destdir + str(segno) + dstfile = tabledir + str(segno) segno += 1 if srcfile != dstfile: hdfscmd = "hadoop fs -mv %s %s" % (srcfile, dstfile) @@ -194,13 +224,10 @@ def move_files_in_hdfs(options, databasename, tablename, files, firstsegno, norm logger.error("Fail to move '%s' to '%s'" % (srcfile, dstfile)) sys.exit(1) else: - for row in rows: - srcdir = row[0].strip() + "/" + str(row[1]) + "/" + str(row[2]) + "/" + str(row[3]) + "/" - segno = firstsegno for file in files: dstfile = file - srcfile = srcdir + str(segno) + srcfile = tabledir + str(segno) segno += 1 if srcfile != dstfile: hdfscmd = "hadoop fs -mv %s %s" % (srcfile, dstfile) @@ -211,7 +238,7 @@ def move_files_in_hdfs(options, databasename, tablename, files, firstsegno, norm sys.exit(1) -def insert_metadata_into_database(options, databasename, tablename, seg_name, firstsegno, eofs): +def insert_metadata_into_database(options, databasename, tablename, seg_name, firstsegno, tabledir, eofs): try: query = "SET allow_system_table_mods='dml';" segno = firstsegno @@ -228,7 +255,7 @@ def insert_metadata_into_database(options, databasename, tablename, seg_name, fi except DatabaseError, ex: logger.error("Failed to connect to database, this script can only be run when the database is up") logger.error("host = %s, port = %d, user = %s, dbname = %s, query = %s" % 
(options.host, options.port, options.user, databasename, query)) - move_files_in_hdfs(options, databasename, tablename, files, firstsegno, False) + move_files_in_hdfs(options, databasename, tablename, files, firstsegno, tabledir, False) sys.exit(1) @@ -245,28 +272,37 @@ def main(args=None): tablename = args[1] filepath = args[2] - # 1. get the seg_name from database + # 1. check whether the path of shell command 'hadoop' is set. + check_hadoop_command() + + # 2. get the seg_name from database seg_name = get_seg_name(options, databasename, tablename) - # 2. check whether target table is hash-typed, in that case simple insertion does not work + # 3. check whether target table is hash-typed, in that case simple insertion does not work result = check_hash_type(options, databasename, tablename) - # 3. get all the files refered by 'filepath', which could be a file or a directory containing all the files + # 4. get the metadata to be inserted from hdfs + firstsegno, tabledir = get_metadata_from_database(options, databasename, tablename, seg_name) + + # 5. check whether all the files refered by 'filepath' and the location corresponding to the table are in the same hdfs cluster + check_files_and_table_in_same_hdfs_cluster(filepath, tabledir) + + # 6. get all the files refered by 'filepath', which could be a file or a directory containing all the files files, sizes = get_files_in_hdfs(filepath) print "File(s) to be registered:" print files - # 4. check whether the file to be registered is parquet format + # 7. check whether the file to be registered is parquet format check_parquet_format(options, files) - # 5. get the metadata to be inserted from hdfs - firstsegno = get_metadata_from_database(options, databasename, seg_name) + # 8. move the file in hdfs to proper location + move_files_in_hdfs(options, databasename, tablename, files, firstsegno, tabledir, True) - # 6. 
move the file in hdfs to proper location - move_files_in_hdfs(options, databasename, tablename, files, firstsegno, True) + # 9. insert the metadata into database + insert_metadata_into_database(options, databasename, tablename, seg_name, firstsegno, tabledir, sizes) - # 7. insert the metadata into database - insert_metadata_into_database(options, databasename, tablename, seg_name, firstsegno, sizes) + # 10. report the final status of hawq register + logger.info("Hawq register succeed.") if __name__ == '__main__': sys.exit(main()) http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/3e3f93dd/tools/doc/hawqregister_help ---------------------------------------------------------------------- diff --git a/tools/doc/hawqregister_help b/tools/doc/hawqregister_help index d5a5a0d..ade1e3a 100644 --- a/tools/doc/hawqregister_help +++ b/tools/doc/hawqregister_help @@ -30,6 +30,8 @@ User have to make sure that the meta data of the parquet file(s) and the table are consistent. The table to be registered into should not be hash distributed, which is created by using "distributed by" statement when creating that table. +The file(s) to be registered and the table in HAWQ must be in the +same HDFS cluster. ***************************************************** Arguments @@ -86,17 +88,18 @@ CONNECTION OPTIONS EXAMPLES ***************************************************** -Run "hawq register" to register a parquet file with path '/temp/hive.paq' -in hdfs generated by hive into table 'parquet_table' in HAWQ, which is in the -database named 'postgres'. +Run "hawq register" to register a parquet file in HDFS with path +'hdfs://localhost:8020/temp/hive.paq' generated by hive into table +'parquet_table' in HAWQ, which is in the database named 'postgres'. + Assume the location of the database is 'hdfs://localhost:8020/hawq_default', tablespace id is '16385', database id is '16387', table filenode id is '77160', last file under the filenode numbered '7'. 
-$ hawq register postgres parquet_table /temp/hive.paq +$ hawq register postgres parquet_table hdfs://localhost:8020/temp/hive.paq -This will move the file '/temp/hive.paq' into the corresponding new place -'hdfs://localhost:8020/hawq_default/16385/16387/77160/8' in hdfs, then +This will move the file 'hdfs://localhost:8020/temp/hive.paq' into the corresponding +new place 'hdfs://localhost:8020/hawq_default/16385/16387/77160/8' in HDFS, then update the meta data of the table 'parquet_table' in HAWQ which is in the table 'pg_aoseg.pg_paqseg_77160'.
