Repository: hive Updated Branches: refs/heads/master a6366fc07 -> bc88d53a2
HIVE-15430: Change SchemaTool table validator to test based on the dbType (Naveen Gangam, reviewed by Aihua Xu) Project: http://git-wip-us.apache.org/repos/asf/hive/repo Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/bc88d53a Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/bc88d53a Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/bc88d53a Branch: refs/heads/master Commit: bc88d53a2b38966148920da0411f5f5ed49213dd Parents: a6366fc Author: Aihua Xu <aihu...@apache.org> Authored: Wed Mar 1 11:59:48 2017 -0500 Committer: Aihua Xu <aihu...@apache.org> Committed: Wed Mar 1 11:59:48 2017 -0500 ---------------------------------------------------------------------- .../org/apache/hive/beeline/HiveSchemaTool.java | 192 ++++++++++++++++--- .../org/apache/hive/beeline/TestSchemaTool.java | 18 +- 2 files changed, 179 insertions(+), 31 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hive/blob/bc88d53a/beeline/src/java/org/apache/hive/beeline/HiveSchemaTool.java ---------------------------------------------------------------------- diff --git a/beeline/src/java/org/apache/hive/beeline/HiveSchemaTool.java b/beeline/src/java/org/apache/hive/beeline/HiveSchemaTool.java index 96b2978..2c088c9 100644 --- a/beeline/src/java/org/apache/hive/beeline/HiveSchemaTool.java +++ b/beeline/src/java/org/apache/hive/beeline/HiveSchemaTool.java @@ -181,7 +181,7 @@ public class HiveSchemaTool { try(Statement stmt = metastoreConn.createStatement(); ResultSet res = stmt.executeQuery(versionQuery)) { if (!res.next()) { - throw new HiveMetaException("Didn't find version data in metastore"); + throw new HiveMetaException("Could not find version info in metastore VERSION table"); } String currentSchemaVersion = res.getString(1); if (checkDuplicatedVersion && res.next()) { @@ -189,7 +189,7 @@ public class HiveSchemaTool { } return currentSchemaVersion; } catch (SQLException e) { - throw new HiveMetaException("Failed to get schema version.", e); + throw new HiveMetaException("Failed to get schema version, Cause:" + e.getMessage()); } } @@ -199,6 +199,7 @@ public class HiveSchemaTool { rtn = checkMetaStoreDBLocation(conn, defaultServers); rtn = checkMetaStoreTableLocation(conn, defaultServers) && rtn; rtn = checkMetaStorePartitionLocation(conn, defaultServers) && rtn; + rtn = checkMetaStoreSkewedColumnsLocation(conn, defaultServers) && rtn; System.out.println((rtn ? "Succeeded" : "Failed") + " in database/table/partition location validation"); return rtn; } @@ -361,6 +362,64 @@ public class HiveSchemaTool { return isValid; } + private boolean checkMetaStoreSkewedColumnsLocation(Connection conn, URI[] defaultServers) + throws HiveMetaException { + String skewedColLoc, skewedColIDRange; + boolean isValid = true; + int numOfInvalid = 0; + if (getDbCommandParser(dbType).needsQuotedIdentifier()) { + skewedColIDRange = "select max(\"STRING_LIST_ID_KID\"), min(\"STRING_LIST_ID_KID\") from \"SKEWED_COL_VALUE_LOC_MAP\" "; + } else { + skewedColIDRange = "select max(STRING_LIST_ID_KID), min(STRING_LIST_ID_KID) from SKEWED_COL_VALUE_LOC_MAP"; + } + + if (getDbCommandParser(dbType).needsQuotedIdentifier()) { + skewedColLoc = "select t.\"TBL_NAME\", t.\"TBL_ID\", sk.\"STRING_LIST_ID_KID\", sk.\"LOCATION\" from \"TBLS\" t, \"SDS\" s, \"SKEWED_COL_VALUE_LOC_MAP\" sk " + + "where sk.\"SD_ID\" = s.\"SD_ID\" and s.\"SD_ID\" = t.\"SD_ID\" and sk.\"STRING_LIST_ID_KID\" >= ? and sk.\"STRING_LIST_ID_KID\" <= ? "; + } else { + skewedColLoc = "select t.TBL_NAME, t.TBL_ID, sk.STRING_LIST_ID_KID, sk.LOCATION from TBLS t, SDS s, SKEWED_COL_VALUE_LOC_MAP sk " + + "where sk.SD_ID = s.SD_ID and s.SD_ID = t.SD_ID and sk.STRING_LIST_ID_KID >= ? and sk.STRING_LIST_ID_KID <= ? "; + } + + long maxID = 0, minID = 0; + long rtnSize = 2000; + + try { + Statement stmt = conn.createStatement(); + ResultSet res = stmt.executeQuery(skewedColIDRange); + if (res.next()) { + maxID = res.getLong(1); + minID = res.getLong(2); + } + res.close(); + stmt.close(); + PreparedStatement pStmt = conn.prepareStatement(skewedColLoc); + while (minID <= maxID) { + pStmt.setLong(1, minID); + pStmt.setLong(2, minID + rtnSize); + res = pStmt.executeQuery(); + while (res.next()) { + String locValue = res.getString(4); + String entity = "Table " + getNameOrID(res,1,2) + + ", String list " + res.getString(3); + if (!checkLocation(entity, locValue, defaultServers)) { + numOfInvalid++; + } + } + res.close(); + minID += rtnSize + 1; + } + pStmt.close(); + } catch (SQLException e) { + throw new HiveMetaException("Failed to get skewed columns location info.", e); + } + if (numOfInvalid > 0) { + isValid = false; + System.err.println("Total number of invalid SKEWED_COL_VALUE_LOC_MAP locations is: "+ numOfInvalid); + } + return isValid; + } + /** * Check if the location is valid for the given entity * @param entity the entity to represent a database, partition or table @@ -537,11 +596,26 @@ public class HiveSchemaTool { System.out.println("Starting metastore validation"); Connection conn = getConnectionToMetastore(false); try { - validateSchemaVersions(conn); - validateSequences(conn); - validateSchemaTables(conn); - validateLocations(conn, this.validationServers); - validateColumnNullValues(conn); + if (validateSchemaVersions(conn)) + System.out.println("[SUCCESS]\n"); + else + System.out.println("[FAIL]\n"); + if (validateSequences(conn)) + System.out.println("[SUCCESS]\n"); + else + System.out.println("[FAIL]\n"); + if (validateSchemaTables(conn)) + System.out.println("[SUCCESS]\n"); + else + System.out.println("[FAIL]\n"); + if (validateLocations(conn, this.validationServers)) + System.out.println("[SUCCESS]\n"); + else + System.out.println("[FAIL]\n"); + if (validateColumnNullValues(conn)) + System.out.println("[SUCCESS]\n"); + else + System.out.println("[FAIL]\n"); } finally { if (conn != null) { try { @@ -620,7 +694,7 @@ public class HiveSchemaTool { } catch (HiveMetaException hme) { if (hme.getMessage().contains("Metastore schema version is not compatible") || hme.getMessage().contains("Multiple versions were found in metastore") - || hme.getMessage().contains("Didn't find version data in metastore")) { + || hme.getMessage().contains("Could not find version info in metastore VERSION table")) { System.out.println("Failed in schema version validation: " + hme.getMessage()); return false; } else { @@ -632,14 +706,27 @@ public class HiveSchemaTool { } boolean validateSchemaTables(Connection conn) throws HiveMetaException { + String version = null; ResultSet rs = null; DatabaseMetaData metadata = null; List<String> dbTables = new ArrayList<String>(); List<String> schemaTables = new ArrayList<String>(); List<String> subScripts = new ArrayList<String>(); - String version = getMetaStoreSchemaVersion(conn); + Connection hmsConn = getConnectionToMetastore(false); + + System.out.println("Validating metastore schema tables"); + try { + version = getMetaStoreSchemaVersion(hmsConn); + } catch (HiveMetaException he) { + System.err.println("Failed to determine schema version from Hive Metastore DB," + he.getMessage()); + LOG.error("Failed to determine schema version from Hive Metastore DB," + he.getMessage()); + return false; + } + + // re-open the hms connection + hmsConn = getConnectionToMetastore(false); - System.out.println("Validating tables in the schema for version " + version); + LOG.info("Validating tables in the schema for version " + version); try { metadata = conn.getMetaData(); String[] types = {"TABLE"}; @@ -652,7 +739,7 @@ public class HiveSchemaTool { LOG.debug("Found table " + table + " in HMS dbstore"); } } catch (SQLException e) { - throw new HiveMetaException(e); + throw new HiveMetaException("Failed to retrieve schema tables from Hive Metastore DB," + e.getMessage()); } finally { if (rs != null) { try { @@ -666,20 +753,24 @@ public class HiveSchemaTool { // parse the schema file to determine the tables that are expected to exist // we are using oracle schema because it is simpler to parse, no quotes or backticks etc String baseDir = new File(metaStoreSchemaInfo.getMetaStoreScriptDir()).getParent(); - String schemaFile = baseDir + "/oracle/hive-schema-" + version + ".oracle.sql"; + String schemaFile = baseDir + "/" + dbType + "/hive-schema-" + version + "." + dbType + ".sql"; try { LOG.debug("Parsing schema script " + schemaFile); subScripts.addAll(findCreateTable(schemaFile, schemaTables)); while (subScripts.size() > 0) { - schemaFile = baseDir + "/oracle/" + subScripts.remove(0); - LOG.debug("Parsing subscript " + schemaFile); + schemaFile = baseDir + "/" + dbType + "/" + subScripts.remove(0); + LOG.info("Parsing subscript " + schemaFile); subScripts.addAll(findCreateTable(schemaFile, schemaTables)); } } catch (Exception e) { + System.err.println("Exception in parsing schema file. Cause:" + e.getMessage()); + System.out.println("Schema table validation failed!!!"); return false; } + LOG.debug("Schema tables:[ " + Arrays.toString(schemaTables.toArray()) + " ]"); + LOG.debug("DB tables:[ " + Arrays.toString(dbTables.toArray()) + " ]"); // now diff the lists int schemaSize = schemaTables.size(); schemaTables.removeAll(dbTables); @@ -690,37 +781,84 @@ public class HiveSchemaTool { System.out.println("Schema table validation failed!!!"); return false; } else { - System.out.println("Succeeded in schema table validation"); + System.out.println("Succeeded in schema table validation. " + schemaSize + " tables matched"); return true; } } - private List<String> findCreateTable(String path, List<String> tableList) { - Matcher matcher = null; - String line = null; - List<String> subs = new ArrayList<String>(); - final String NESTED_SCRIPT_IDENTIFIER = "@"; - Pattern regexp = Pattern.compile("(CREATE TABLE(IF NOT EXISTS)*) (\\S+).*"); + private List<String> findCreateTable(String path, List<String> tableList) + throws Exception { + NestedScriptParser sp = HiveSchemaHelper.getDbCommandParser(dbType); + Matcher matcher = null; + Pattern regexp = null; + List<String> subs = new ArrayList<String>(); + int groupNo = 0; + + switch (dbType) { + case HiveSchemaHelper.DB_ORACLE: + regexp = Pattern.compile("(CREATE TABLE(IF NOT EXISTS)*) (\\S+).*"); + groupNo = 3; + break; + + case HiveSchemaHelper.DB_MYSQL: + regexp = Pattern.compile("(CREATE TABLE) (\\S+).*"); + groupNo = 2; + break; + + case HiveSchemaHelper.DB_MSSQL: + regexp = Pattern.compile("(CREATE TABLE) (\\S+).*"); + groupNo = 2; + break; + + case HiveSchemaHelper.DB_DERBY: + regexp = Pattern.compile("(CREATE TABLE(IF NOT EXISTS)*) (\\S+).*"); + groupNo = 3; + break; + + case HiveSchemaHelper.DB_POSTGRACE: + regexp = Pattern.compile("(CREATE TABLE(IF NOT EXISTS)*) (\\S+).*"); + groupNo = 3; + break; + + default: + regexp = Pattern.compile("(CREATE TABLE(IF NOT EXISTS)*) (\\S+).*"); + groupNo = 3; + break; + } + + if (!(new File(path)).exists()) { + throw new Exception(path + " does not exist. Potentially incorrect version in the metastore VERSION table"); + } try ( BufferedReader reader = new BufferedReader(new FileReader(path)); ){ + String line = null; while ((line = reader.readLine()) != null) { - if (line.startsWith(NESTED_SCRIPT_IDENTIFIER)) { - int endIndex = (line.indexOf(";") > -1 ) ? line.indexOf(";") : line.length(); - // remove the trailing SEMI-COLON if any - subs.add(line.substring(NESTED_SCRIPT_IDENTIFIER.length(), endIndex)); + if (sp.isNestedScript(line)) { + String subScript = null; + subScript = sp.getScriptName(line); + LOG.debug("Schema subscript " + subScript + " found"); + subs.add(subScript); continue; } + line = line.replaceAll("\\(", " "); + line = line.replaceAll("IF NOT EXISTS ", ""); + line = line.replaceAll("`",""); + line = line.replaceAll("'",""); + line = line.replaceAll("\"",""); matcher = regexp.matcher(line); + if (matcher.find()) { - String table = matcher.group(3); + String table = matcher.group(groupNo); + if (dbType.equals("derby")) + table = table.replaceAll("APP.",""); tableList.add(table.toLowerCase()); LOG.debug("Found table " + table + " in the schema"); } } } catch (IOException ex){ - ex.printStackTrace(); + throw new Exception(ex.getMessage()); } return subs; http://git-wip-us.apache.org/repos/asf/hive/blob/bc88d53a/itests/hive-unit/src/test/java/org/apache/hive/beeline/TestSchemaTool.java ---------------------------------------------------------------------- diff --git a/itests/hive-unit/src/test/java/org/apache/hive/beeline/TestSchemaTool.java b/itests/hive-unit/src/test/java/org/apache/hive/beeline/TestSchemaTool.java index 5241807..22630b9 100644 --- a/itests/hive-unit/src/test/java/org/apache/hive/beeline/TestSchemaTool.java +++ b/itests/hive-unit/src/test/java/org/apache/hive/beeline/TestSchemaTool.java @@ -616,8 +616,12 @@ public class TestSchemaTool extends TestCase { "insert into SDS(SD_ID,CD_ID,INPUT_FORMAT,IS_COMPRESSED,IS_STOREDASSUBDIRECTORIES,LOCATION,NUM_BUCKETS,OUTPUT_FORMAT,SERDE_ID) values (2,null,'org.apache.hadoop.mapred.TextInputFormat','N','N','hdfs://myhost.com:8020/user/admin/2015_11_18',-1,'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat',null)", "insert into SDS(SD_ID,CD_ID,INPUT_FORMAT,IS_COMPRESSED,IS_STOREDASSUBDIRECTORIES,LOCATION,NUM_BUCKETS,OUTPUT_FORMAT,SERDE_ID) values (3,null,'org.apache.hadoop.mapred.TextInputFormat','N','N',null,-1,'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat',null)", "insert into TBLS(TBL_ID,CREATE_TIME,DB_ID,LAST_ACCESS_TIME,OWNER,RETENTION,SD_ID,TBL_NAME,TBL_TYPE,VIEW_EXPANDED_TEXT,VIEW_ORIGINAL_TEXT,IS_REWRITE_ENABLED) values (2 ,1435255431,2,0 ,'hive',0,1,'mytal','MANAGED_TABLE',NULL,NULL,'n')", - "insert into PARTITiONS(PART_ID,CREATE_TIME,LAST_ACCESS_TIME, PART_NAME,SD_ID,TBL_ID) values(1, 1441402388,0, 'd1=1/d2=1',2,2)", - "insert into TBLS(TBL_ID,CREATE_TIME,DB_ID,LAST_ACCESS_TIME,OWNER,RETENTION,SD_ID,TBL_NAME,TBL_TYPE,VIEW_EXPANDED_TEXT,VIEW_ORIGINAL_TEXT,IS_REWRITE_ENABLED) values (3 ,1435255431,2,0 ,'hive',0,3,'myView','VIRTUAL_VIEW','select a.col1,a.col2 from foo','select * from foo','n')" + "insert into PARTITIONS(PART_ID,CREATE_TIME,LAST_ACCESS_TIME, PART_NAME,SD_ID,TBL_ID) values(1, 1441402388,0, 'd1=1/d2=1',2,2)", + "insert into TBLS(TBL_ID,CREATE_TIME,DB_ID,LAST_ACCESS_TIME,OWNER,RETENTION,SD_ID,TBL_NAME,TBL_TYPE,VIEW_EXPANDED_TEXT,VIEW_ORIGINAL_TEXT,IS_REWRITE_ENABLED) values (3 ,1435255431,2,0 ,'hive',0,3,'myView','VIRTUAL_VIEW','select a.col1,a.col2 from foo','select * from foo','n')", + "insert into SKEWED_STRING_LIST values(1)", + "insert into SKEWED_STRING_LIST values(2)", + "insert into SKEWED_COL_VALUE_LOC_MAP values(1,1,'hdfs://myhost.com:8020/user/hive/warehouse/mytal/HIVE_DEFAULT_LIST_BUCKETING_DIR_NAME/')", + "insert into SKEWED_COL_VALUE_LOC_MAP values(2,2,'s3://myhost.com:8020/user/hive/warehouse/mytal/HIVE_DEFAULT_LIST_BUCKETING_DIR_NAME/')" }; File scriptFile = generateTestScript(scripts); schemaTool.runBeeLine(scriptFile.getPath()); @@ -626,6 +630,8 @@ public class TestSchemaTool extends TestCase { isValid = schemaTool.validateLocations(conn, new URI[] {defaultRoot, defaultRoot2}); assertTrue(isValid); scripts = new String[] { + "delete from SKEWED_COL_VALUE_LOC_MAP", + "delete from SKEWED_STRING_LIST", "delete from PARTITIONS", "delete from TBLS", "delete from SDS", @@ -634,11 +640,15 @@ public class TestSchemaTool extends TestCase { "insert into SDS(SD_ID,CD_ID,INPUT_FORMAT,IS_COMPRESSED,IS_STOREDASSUBDIRECTORIES,LOCATION,NUM_BUCKETS,OUTPUT_FORMAT,SERDE_ID) values (1,null,'org.apache.hadoop.mapred.TextInputFormat','N','N','hdfs://yourhost.com:8020/user/hive/warehouse/mydb',-1,'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat',null)", "insert into SDS(SD_ID,CD_ID,INPUT_FORMAT,IS_COMPRESSED,IS_STOREDASSUBDIRECTORIES,LOCATION,NUM_BUCKETS,OUTPUT_FORMAT,SERDE_ID) values (2,null,'org.apache.hadoop.mapred.TextInputFormat','N','N','file:///user/admin/2015_11_18',-1,'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat',null)", "insert into TBLS(TBL_ID,CREATE_TIME,DB_ID,LAST_ACCESS_TIME,OWNER,RETENTION,SD_ID,TBL_NAME,TBL_TYPE,VIEW_EXPANDED_TEXT,VIEW_ORIGINAL_TEXT,IS_REWRITE_ENABLED) values (2 ,1435255431,2,0 ,'hive',0,1,'mytal','MANAGED_TABLE',NULL,NULL,'n')", - "insert into PARTITiONS(PART_ID,CREATE_TIME,LAST_ACCESS_TIME, PART_NAME,SD_ID,TBL_ID) values(1, 1441402388,0, 'd1=1/d2=1',2,2)", + "insert into PARTITIONS(PART_ID,CREATE_TIME,LAST_ACCESS_TIME, PART_NAME,SD_ID,TBL_ID) values(1, 1441402388,0, 'd1=1/d2=1',2,2)", "insert into SDS(SD_ID,CD_ID,INPUT_FORMAT,IS_COMPRESSED,IS_STOREDASSUBDIRECTORIES,LOCATION,NUM_BUCKETS,OUTPUT_FORMAT,SERDE_ID) values (3000,null,'org.apache.hadoop.mapred.TextInputFormat','N','N','yourhost.com:8020/user/hive/warehouse/mydb',-1,'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat',null)", "insert into SDS(SD_ID,CD_ID,INPUT_FORMAT,IS_COMPRESSED,IS_STOREDASSUBDIRECTORIES,LOCATION,NUM_BUCKETS,OUTPUT_FORMAT,SERDE_ID) values (5000,null,'org.apache.hadoop.mapred.TextInputFormat','N','N','file:///user/admin/2016_11_18',-1,'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat',null)", "insert into TBLS(TBL_ID,CREATE_TIME,DB_ID,LAST_ACCESS_TIME,OWNER,RETENTION,SD_ID,TBL_NAME,TBL_TYPE,VIEW_EXPANDED_TEXT,VIEW_ORIGINAL_TEXT,IS_REWRITE_ENABLED) values (3000 ,1435255431,2,0 ,'hive',0,3000,'mytal3000','MANAGED_TABLE',NULL,NULL,'n')", - "insert into PARTITiONS(PART_ID,CREATE_TIME,LAST_ACCESS_TIME, PART_NAME,SD_ID,TBL_ID) values(5000, 1441402388,0, 'd1=1/d2=5000',5000,2)" + "insert into PARTITIONS(PART_ID,CREATE_TIME,LAST_ACCESS_TIME, PART_NAME,SD_ID,TBL_ID) values(5000, 1441402388,0, 'd1=1/d2=5000',5000,2)", + "insert into SKEWED_STRING_LIST values(1)", + "insert into SKEWED_STRING_LIST values(2)", + "insert into SKEWED_COL_VALUE_LOC_MAP values(1,1,'hdfs://yourhost.com:8020/user/hive/warehouse/mytal/HIVE_DEFAULT_LIST_BUCKETING_DIR_NAME/')", + "insert into SKEWED_COL_VALUE_LOC_MAP values(2,2,'file:///user/admin/warehouse/mytal/HIVE_DEFAULT_LIST_BUCKETING_DIR_NAME/')" }; scriptFile = generateTestScript(scripts); schemaTool.runBeeLine(scriptFile.getPath());