This is an automated email from the ASF dual-hosted git repository.

anishek pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git
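HIVE-24526 adds two HiveMetaTool commands: -listExtTblLocs and -diffExtTblLocs. A minimal usage sketch, assuming the usual metatool launcher (hive --service metatool); the database pattern, file names, and output directories here are illustrative:

    # Group external-table data locations for databases matching a pattern.
    # Writes externalTableLocations_<pattern>_<timestamp>.txt under the output directory.
    hive --service metatool -listExtTblLocs "db*" /tmp/extTblLocs1

    # Diff two listings produced by -listExtTblLocs; writes diff_<timestamp> under the output directory.
    hive --service metatool -diffExtTblLocs /tmp/extTblLocs1/out1.txt /tmp/extTblLocs2/out2.txt /tmp/extTblDiff

Both commands emit JSON mapping each covering directory to the entities (tables and partitions) under it; out1.txt and out2.txt stand in for the generated file names.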
The following commit(s) were added to refs/heads/master by this push: new d5ea2f3 HIVE-24526: Get grouped locations of external table data using metatool. (Arko Sharma, reviewed by Pravin Kumar Sinha ) d5ea2f3 is described below commit d5ea2f3bb81cd992ce2cf6ad1da23fc4db67c471 Author: Anishek Agarwal <anis...@gmail.com> AuthorDate: Tue Jan 5 16:38:42 2021 +0530 HIVE-24526: Get grouped locations of external table data using metatool. (Arko Sharma, reviewed by Pravin Kumar Sinha ) --- .../metastore/tools/metatool/TestHiveMetaTool.java | 312 +++++++++- .../metastore/tools/metatool/HiveMetaTool.java | 4 + .../tools/metatool/HiveMetaToolCommandLine.java | 59 +- .../tools/metatool/MetaToolTaskDiffExtTblLocs.java | 161 +++++ .../tools/metatool/MetaToolTaskListExtTblLocs.java | 668 +++++++++++++++++++++ .../metatool/TestHiveMetaToolCommandLine.java | 30 +- .../metatool/TestMetaToolTaskListExtTblLocs.java | 291 +++++++++ 7 files changed, 1518 insertions(+), 7 deletions(-) diff --git a/itests/hive-unit/src/test/java/org/apache/hadoop/hive/metastore/tools/metatool/TestHiveMetaTool.java b/itests/hive-unit/src/test/java/org/apache/hadoop/hive/metastore/tools/metatool/TestHiveMetaTool.java index 81b7ff0..22e3fe0 100644 --- a/itests/hive-unit/src/test/java/org/apache/hadoop/hive/metastore/tools/metatool/TestHiveMetaTool.java +++ b/itests/hive-unit/src/test/java/org/apache/hadoop/hive/metastore/tools/metatool/TestHiveMetaTool.java @@ -19,15 +19,24 @@ package org.apache.hadoop.hive.metastore.tools.metatool; import java.io.ByteArrayOutputStream; +import java.io.File; +import java.io.IOException; import java.io.OutputStream; import java.io.PrintStream; +import java.nio.file.Files; +import java.nio.file.Paths; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; +import java.util.Set; - +import org.json.JSONObject; +import org.json.JSONArray; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hive.metastore.conf.MetastoreConf; import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.metastore.HiveMetaStoreClient; import org.apache.hadoop.hive.metastore.api.Database; @@ -35,12 +44,25 @@ import org.apache.hadoop.hive.metastore.api.FieldSchema; import org.apache.hadoop.hive.metastore.api.SerDeInfo; import org.apache.hadoop.hive.metastore.api.StorageDescriptor; import org.apache.hadoop.hive.metastore.api.Table; +import org.apache.hadoop.hive.metastore.utils.TestTxnDbUtil; +import org.apache.hadoop.hive.metastore.txn.TxnStore; +import org.apache.hadoop.hive.metastore.txn.TxnUtils; +import org.apache.commons.io.FileUtils; +import org.apache.hadoop.fs.FileUtil; +import org.apache.hadoop.hive.ql.io.HiveInputFormat; +import org.apache.hadoop.hive.ql.processors.CommandProcessorException; +import org.apache.hadoop.hive.ql.session.SessionState; +import org.apache.hadoop.hive.ql.QueryState; +import org.apache.hadoop.hive.ql.Driver; import org.apache.hadoop.hive.serde.serdeConstants; import org.apache.hadoop.hive.serde2.avro.AvroSerdeUtils.AvroTableProperties; import org.apache.hadoop.util.StringUtils; import org.apache.thrift.TException; +import org.junit.Assert; import static org.junit.Assert.assertTrue; import static org.junit.Assert.assertEquals; +import com.google.gson.JsonParser; +import org.json.JSONObject; import org.junit.Before; import org.junit.After; import org.junit.Test; @@ -57,7 +79,12 @@ public class TestHiveMetaTool { private HiveMetaStoreClient client; private OutputStream os; - + 
protected Driver d; + protected TxnStore txnHandler; + private static HiveConf hiveConf; + private static final String TEST_DATA_DIR = new File(System.getProperty("java.io.tmpdir") + + File.separator + TestHiveMetaTool.class.getCanonicalName() + "-" + System.currentTimeMillis() + ).getPath().replaceAll("\\\\", "/"); @Before public void setUp() throws Exception { @@ -66,19 +93,60 @@ public class TestHiveMetaTool { os = new ByteArrayOutputStream(); System.setOut(new PrintStream(os)); - HiveConf hiveConf = new HiveConf(HiveMetaTool.class); + hiveConf = new HiveConf(HiveMetaTool.class); client = new HiveMetaStoreClient(hiveConf); createDatabase(); createTable(); client.close(); + Path workDir = new Path(System.getProperty("test.tmp.dir", + "target" + File.separator + "test" + File.separator + "tmp")); + hiveConf.set("mapred.local.dir", workDir + File.separator + this.getClass().getSimpleName() + + File.separator + "mapred" + File.separator + "local"); + hiveConf.set("mapred.system.dir", workDir + File.separator + this.getClass().getSimpleName() + + File.separator + "mapred" + File.separator + "system"); + hiveConf.set("mapreduce.jobtracker.staging.root.dir", workDir + File.separator + this.getClass().getSimpleName() + + File.separator + "mapred" + File.separator + "staging"); + hiveConf.set("mapred.temp.dir", workDir + File.separator + this.getClass().getSimpleName() + + File.separator + "mapred" + File.separator + "temp"); + hiveConf.set(HiveConf.ConfVars.PREEXECHOOKS.varname, ""); + hiveConf.set(HiveConf.ConfVars.POSTEXECHOOKS.varname, ""); + hiveConf.set(HiveConf.ConfVars.METASTOREWAREHOUSE.varname, getWarehouseDir()); + hiveConf.setVar(HiveConf.ConfVars.HIVEINPUTFORMAT, HiveInputFormat.class.getName()); + hiveConf.setVar(HiveConf.ConfVars.HIVE_AUTHORIZATION_MANAGER, + "org.apache.hadoop.hive.ql.security.authorization.plugin.sqlstd.SQLStdHiveAuthorizerFactory"); + hiveConf.setBoolVar(HiveConf.ConfVars.MERGE_CARDINALITY_VIOLATION_CHECK, true); + HiveConf.setBoolVar(hiveConf, HiveConf.ConfVars.MERGE_SPLIT_UPDATE, true); + hiveConf.setBoolVar(HiveConf.ConfVars.HIVESTATSCOLAUTOGATHER, false); + hiveConf.setBoolean("mapred.input.dir.recursive", true); + TestTxnDbUtil.setConfValues(hiveConf); + txnHandler = TxnUtils.getTxnStore(hiveConf); + TestTxnDbUtil.prepDb(hiveConf); + File f = new File(getWarehouseDir()); + if (f.exists()) { + FileUtil.fullyDelete(f); + } + if (!(new File(getWarehouseDir()).mkdirs())) { + throw new RuntimeException("Could not create " + getWarehouseDir()); + } + SessionState ss = SessionState.start(hiveConf); + ss.applyAuthorizationPolicy(); + d = new Driver(new QueryState.Builder().withHiveConf(hiveConf).nonIsolated().build()); + d.setMaxRows(10000); } catch (Exception e) { System.err.println("Unable to setup the hive metatool test"); System.err.println(StringUtils.stringifyException(e)); throw new Exception(e); } } + protected String getWarehouseDir() { + return getTestDataDir() + "/warehouse"; + } + + private String getTestDataDir() { + return TEST_DATA_DIR; + } private void createDatabase() throws Exception { if (client.getAllDatabases().contains(DB_NAME)) { @@ -142,17 +210,255 @@ public class TestHiveMetaTool { checkAvroSchemaURLProps(AVRO_URI); } + /* + * Tests -listExtTblLocs option on various input combinations. 
+ */ + @Test + public void testListExtTblLocs() throws Exception { + String extTblLocation = getTestDataDir() + "/ext"; + String outLocation = getTestDataDir() + "/extTblOutput/"; + Configuration conf = MetastoreConf.newMetastoreConf(); + MetastoreConf.setVar(conf, MetastoreConf.ConfVars.WAREHOUSE_EXTERNAL, getWarehouseDir()); + MetaToolTaskListExtTblLocs.msConf = conf; + + // Case 1 : Check default locations + // Inputs : db1, db2 in default locations, db3 in custom location + // Expected outputs: default locations for db1, db2 and custom location for db3 after aggregation + runStatementOnDriver("create database db1"); + runStatementOnDriver("create database db2"); + runStatementOnDriver("create database db3"); + runStatementOnDriver("create external table db1.ext(a int) partitioned by (p int)"); + runStatementOnDriver("create external table db2.ext(a int) partitioned by (p int)"); + runStatementOnDriver("create external table db3.ext(a int) partitioned by (p int) " + + "location '" + getTestDataDir() + "/ext/tblLoc'"); + runStatementOnDriver("alter table db3.ext add partition(p = 0) location '" + getTestDataDir() + "/part'" ); + runStatementOnDriver("alter table db3.ext add partition(p = 1) location '" + getTestDataDir() + "/part'" ); + JSONObject outJS = getListExtTblLocs("db*", outLocation); + //confirm default locations + Set<String> outLocationSet = outJS.keySet(); + String expectedOutLoc1 = getAbsolutePath(getWarehouseDir() + "/db1.db"); + Assert.assertTrue(outLocationSet.contains(expectedOutLoc1)); + Assert.assertEquals(outLocationSet.size(), 4); + JSONArray outArr = outJS.getJSONArray(expectedOutLoc1); + Assert.assertEquals(outArr.length(), 1); + Assert.assertTrue(outArr.getString(0).equals("db1.ext")); + String expectedOutLoc2 = getAbsolutePath(getWarehouseDir() + "/db2.db"); + Assert.assertTrue(outLocationSet.contains(expectedOutLoc2)); + outArr = outJS.getJSONArray(expectedOutLoc2); + Assert.assertEquals(outArr.length(), 1); + Assert.assertTrue(outArr.getString(0).equals("db2.ext")); + String expectedOutLoc3 = getAbsolutePath(getTestDataDir() + "/part"); + Assert.assertTrue(outLocationSet.contains(expectedOutLoc3)); + outArr = outJS.getJSONArray(expectedOutLoc3); + Assert.assertEquals(outArr.length(), 2); + Assert.assertTrue(outArr.getString(0).equals("db3.ext.p=0")); + Assert.assertTrue(outArr.getString(1).equals("db3.ext.p=1")); + String expectedOutLoc4 = getAbsolutePath(getTestDataDir() + "/ext/tblLoc"); + Assert.assertTrue(outLocationSet.contains(expectedOutLoc4)); + outArr = outJS.getJSONArray(expectedOutLoc4); + Assert.assertEquals(outArr.length(), 1); + Assert.assertTrue(outArr.getString(0).equals("db3.ext p(0/2)")); + + + // Case 2 : Check with special chars in partition-names : including quotes, timestamp formats, spaces, backslash etc. + // Also checks count of partitions in tbl-location. + // inputs (default database) + // ../ext/t1 - table1 location containing 3/5 partitions + // ../ext/t2 - table2 location containing 2/4 partitions + // ../ext/dir1/dir2/dir3 - 2 partitions of table1, 2 partitions of table2, table loc of table3 with 0 partitions.
+ // ../ext - partitions of table3 + // expected output : [../ext/t1, ../ext/t2, ../ext/dir1/dir2/dir3/t1_parts (2 partitions), ../ext/dir1/dir2/dir3/t2_parts (2 partitions), + // ../ext/dir1/dir2/dir3/t3 (0 partitions), ../ext/t3_parts (3 partitions) ] + // Doesn't contain default database location as there are no entities in default location in this case, + // all data is under some custom location (../ext) + runStatementOnDriver("drop table ext"); + runStatementOnDriver("create external table ext(a int) partitioned by (p varchar(3)) " + + "location '" + getTestDataDir() + "/ext/t1'"); + runStatementOnDriver("create external table ext2(a int) partitioned by (flt string, dbl string) " + + "location '" + getTestDataDir() + "/ext/t2'"); + runStatementOnDriver("create external table ext3(a int) partitioned by (dt string, timeSt string) " + + "location '" + getTestDataDir() + "/ext/dir1/dir2/dir3/t3'"); + runStatementOnDriver("alter table ext add partition(p = 'A')"); + runStatementOnDriver("alter table ext add partition(p = 'B')"); + runStatementOnDriver("alter table ext add partition(p = 'UK')" ); + runStatementOnDriver("alter table ext2 add partition(flt = '0.0', dbl = '0')"); + runStatementOnDriver("alter table ext2 add partition(flt = '0.1', dbl = '1.1')"); + runStatementOnDriver("alter table ext3 add partition(dt = '2020-12-01', timeSt = '23:23:23') location '" + + getTestDataDir() + "/ext/t3_parts'" ); + runStatementOnDriver("alter table ext3 add partition(dt = '2020-12-02', timeSt = '22:22:22') location '" + + getTestDataDir() + "/ext/t3_parts'" ); + runStatementOnDriver("alter table ext3 add partition(dt = '2020-12-03', timeSt = '21:21:21.1234') location '" + + getTestDataDir() + "/ext/t3_parts'" ); + runStatementOnDriver("alter table ext add partition(p = \'A\\\\\') location '" + + getTestDataDir() + "/ext/dir1/dir2/dir3/t1_parts'" ); + runStatementOnDriver("alter table ext add partition(p = \' A\"\') location '" + + getTestDataDir() + "/ext/dir1/dir2/dir3/t1_parts'" ); + runStatementOnDriver("alter table ext2 add partition(flt = '0.1', dbl='3.22') location '" + + getTestDataDir() + "/ext/dir1/dir2/dir3/t2_parts'"); + runStatementOnDriver("alter table ext2 add partition(flt = '0.22', dbl = '2.22') location '" + + getTestDataDir() + "/ext/dir1/dir2/dir3/t2_parts'"); + + + outJS = getListExtTblLocs("default", outLocation); + expectedOutLoc1 = getAbsolutePath(extTblLocation + "/t1"); + expectedOutLoc2 = getAbsolutePath(extTblLocation + "/t2"); + expectedOutLoc3 = getAbsolutePath(extTblLocation + "/dir1/dir2/dir3/t1_parts"); + expectedOutLoc4 = getAbsolutePath(extTblLocation + "/dir1/dir2/dir3/t2_parts"); + String expectedOutLoc5 = getAbsolutePath(extTblLocation + "/dir1/dir2/dir3/t3"); + String expectedOutLoc6 = getAbsolutePath(extTblLocation + "/t3_parts"); + + outLocationSet = outJS.keySet(); + Assert.assertEquals(outLocationSet.size(), 6); + Assert.assertTrue(outLocationSet.contains(expectedOutLoc1)); + outArr = outJS.getJSONArray(expectedOutLoc1); //t1 + Assert.assertEquals(outArr.length(), 1); + Assert.assertTrue(outArr.getString(0).equals("default.ext p(3/5)")); + Assert.assertTrue(outLocationSet.contains(expectedOutLoc2)); + outArr = outJS.getJSONArray(expectedOutLoc2); //t2 + Assert.assertEquals(outArr.length(), 1); + Assert.assertTrue(outArr.getString(0).equals("default.ext2 p(2/4)")); + Assert.assertTrue(outLocationSet.contains(expectedOutLoc3)); //t1_parts + outArr = outJS.getJSONArray(expectedOutLoc3); + Assert.assertEquals(outArr.length(), 2); +
Assert.assertTrue(outArr.getString(0).equals("default.ext.p= A%22")); //spaces, quotes + Assert.assertTrue(outArr.getString(1).equals("default.ext.p=A%5C")); //backslash + Assert.assertTrue(outLocationSet.contains(expectedOutLoc4)); //t2_parts + outArr = outJS.getJSONArray(expectedOutLoc4); + Assert.assertEquals(outArr.length(), 2); + Assert.assertTrue(outArr.getString(0).equals("default.ext2.flt=0.1/dbl=3.22")); //periods, slash + Assert.assertTrue(outArr.getString(1).equals("default.ext2.flt=0.22/dbl=2.22")); + Assert.assertTrue(outLocationSet.contains(expectedOutLoc5)); //t3 + outArr = outJS.getJSONArray(expectedOutLoc5); + Assert.assertEquals(outArr.length(), 1); + Assert.assertTrue(outArr.getString(0).equals("default.ext3 p(0/3)")); + Assert.assertTrue(outLocationSet.contains(expectedOutLoc6)); //t3_parts + outArr = outJS.getJSONArray(expectedOutLoc6); + Assert.assertEquals(outArr.length(), 3); + Assert.assertTrue(outArr.getString(0).equals("default.ext3.dt=2020-12-01/timest=23%3A23%3A23")); //date, timestamp formats + Assert.assertTrue(outArr.getString(1).equals("default.ext3.dt=2020-12-02/timest=22%3A22%3A22")); + Assert.assertTrue(outArr.getString(2).equals("default.ext3.dt=2020-12-03/timest=21%3A21%3A21.1234")); + } + + /* + * Tests -diffExtTblLocs option on various input combinations. + */ + @Test + public void testDiffExtTblLocs() throws Exception { + String extTblLocation = getTestDataDir() + "/ext"; + String outLocation = getTestDataDir() + "/extTblOutput"; + Configuration conf = MetastoreConf.newMetastoreConf(); + MetastoreConf.setVar(conf, MetastoreConf.ConfVars.WAREHOUSE_EXTERNAL, getWarehouseDir()); + MetaToolTaskListExtTblLocs.msConf = conf; + + //create first file using -listExtTblLocs + runStatementOnDriver("create database diffDb"); + runStatementOnDriver("create external table diffDb.ext1(a int) partitioned by (p int)"); + runStatementOnDriver("create external table diffDb.ext2(a int) partitioned by (p int)"); + runStatementOnDriver("create external table diffDb.ext3(a int) partitioned by (p int) " + + "location '" + getTestDataDir() + "/ext/tblLoc'"); + runStatementOnDriver("alter table diffDb.ext1 add partition(p = 0) location '" + getTestDataDir() + "/part'" ); + runStatementOnDriver("alter table diffDb.ext1 add partition(p = 1) location '" + getTestDataDir() + "/part'" ); + String outLocation1 = outLocation + "1"; + getListExtTblLocs("diffDb", outLocation1); + + //create second file using -listExtTblLocs after dropping a table, dropping a partition and adding a different partition + runStatementOnDriver("drop table diffDb.ext2"); + runStatementOnDriver("alter table diffDb.ext1 drop partition(p = 0)" ); + runStatementOnDriver("alter table diffDb.ext1 add partition(p = 3) location '" + getTestDataDir() + "/part'" ); + String outLocation2 = outLocation + "2"; + getListExtTblLocs("diffDb", outLocation2); + + //run diff on the above two files + JSONObject outJS = getDiffExtTblLocs(outLocation1, outLocation2, outLocation); + Set<String> outLocationSet = outJS.keySet(); + String defaultDbLoc = getAbsolutePath(getWarehouseDir() + "/diffdb.db"); + Assert.assertEquals(outLocationSet.size(), 2); + Assert.assertTrue(outLocationSet.contains(defaultDbLoc)); + JSONArray outArr = outJS.getJSONArray(defaultDbLoc); + Assert.assertEquals(outArr.length(), 1); + Assert.assertTrue(outArr.getString(0).equals("- diffdb.ext2")); // dropped ext2 from default location + String partLoc = getAbsolutePath(getTestDataDir() + "/part"); + Assert.assertTrue(outLocationSet.contains(partLoc)); + 
outArr = outJS.getJSONArray(partLoc); + Assert.assertEquals(outArr.length(), 2); //two entries - 1 for added partition and 1 for dropped partition + Assert.assertTrue(outArr.getString(0).equals("+ diffdb.ext1.p=3")); + Assert.assertTrue(outArr.getString(1).equals("- diffdb.ext1.p=0")); + } + + private String getAbsolutePath(String extTblLocation) { + return "file:" + extTblLocation; + } + + private JSONObject getListExtTblLocs(String dbName, String outLocation) throws IOException { + File f = new File(outLocation); + if (f.exists()) { + FileUtil.fullyDelete(f); + } + if (!(new File(outLocation).mkdirs())) { + throw new RuntimeException("Could not create " + outLocation); + } + HiveMetaTool.main(new String[] {"-listExtTblLocs", dbName, outLocation}); + for (File outFile : f.listFiles()) { + String contents = new String(Files.readAllBytes(Paths.get(outFile.getAbsolutePath()))); + return new JSONObject(contents); + } + return null; + } + + private JSONObject getDiffExtTblLocs(String fileLoc1, String fileLoc2, String outLocation) throws IOException { + File f = new File(outLocation); + if (f.exists()) { + FileUtil.fullyDelete(f); + } + if (!(new File(outLocation).mkdirs())) { + throw new RuntimeException("Could not create " + outLocation); + } + File f1 = new File(fileLoc1); + File f2 = new File(fileLoc2); + for (File outFile1 : f1.listFiles()) { + for (File outFile2 : f2.listFiles()) { + HiveMetaTool.main(new String[] {"-diffExtTblLocs", outFile1.getAbsolutePath(), outFile2.getAbsolutePath(), outLocation}); + for(File outFile : f.listFiles()) { + String contents = new String(Files.readAllBytes(Paths.get(outFile.getAbsolutePath()))); + return new JSONObject(contents); + } + } + } + return null; + } + private void checkAvroSchemaURLProps(String expectedUri) throws TException { Table table = client.getTable(DB_NAME, TABLE_NAME); assertEquals(expectedUri, table.getParameters().get(AvroTableProperties.SCHEMA_URL.getPropName())); assertEquals(expectedUri, table.getSd().getParameters().get(AvroTableProperties.SCHEMA_URL.getPropName())); } + protected List<String> runStatementOnDriver(String stmt) throws Exception { + try { + d.run(stmt); + } catch (CommandProcessorException e) { + throw new RuntimeException(stmt + " failed: " + e); + } + List<String> rs = new ArrayList<>(); + d.getResults(rs); + return rs; + } + @After public void tearDown() throws Exception { try { client.dropTable(DB_NAME, TABLE_NAME); client.dropDatabase(DB_NAME); + try { + if (d != null) { + d.close(); + d.destroy(); + d = null; + } + } finally { + TestTxnDbUtil.cleanDb(hiveConf); + FileUtils.deleteDirectory(new File(getTestDataDir())); + } client.close(); } catch (Throwable e) { diff --git a/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/tools/metatool/HiveMetaTool.java b/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/tools/metatool/HiveMetaTool.java index 760d78d..913146e 100644 --- a/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/tools/metatool/HiveMetaTool.java +++ b/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/tools/metatool/HiveMetaTool.java @@ -50,6 +50,10 @@ public final class HiveMetaTool { task = new MetaToolTaskExecuteJDOQLQuery(); } else if (cl.isUpdateLocation()) { task = new MetaToolTaskUpdateLocation(); + } else if (cl.isListExtTblLocs()) { + task = new MetaToolTaskListExtTblLocs(); + } else if (cl.isDiffExtTblLocs()) { + task = new MetaToolTaskDiffExtTblLocs(); 
} else { throw new IllegalArgumentException("No task was specified!"); } diff --git a/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/tools/metatool/HiveMetaToolCommandLine.java b/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/tools/metatool/HiveMetaToolCommandLine.java index 1223f0d..ce43a8c 100644 --- a/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/tools/metatool/HiveMetaToolCommandLine.java +++ b/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/tools/metatool/HiveMetaToolCommandLine.java @@ -58,6 +58,25 @@ class HiveMetaToolCommandLine { .create("updateLocation"); @SuppressWarnings("static-access") + private static final Option LIST_EXT_TBL_LOCS = OptionBuilder + .withArgName("dbName> " + " <output-loc") + .hasArgs(2) + .withDescription("Generates a file containing a list of directories which cover external table data locations " + + "for the specified database. A database name or pattern must be specified, on which the tool will be run. " + + "The output is generated at the specified location." + ) + .create("listExtTblLocs"); + + @SuppressWarnings("static-access") + private static final Option DIFF_EXT_TBL_LOCS = OptionBuilder + .withArgName("file1> " + " <file2> " + "<output-loc") + .hasArgs(3) + .withDescription("Generates the difference between two output-files created using -listExtTblLocs option at the" + + " specified location. Output contains locations (keys) unique to each input file. For keys common to both " + + "input-files, those entities are listed which are deleted from the first file and introduced in the second." + ) + .create("diffExtTblLocs"); + private static final Option DRY_RUN = OptionBuilder .withDescription("Perform a dry run of updateLocation changes.When run with the dryRun option updateLocation " + "changes are displayed but not persisted. dryRun is valid only with the updateLocation option.") @@ -93,6 +112,8 @@ class HiveMetaToolCommandLine { OPTIONS.addOption(LIST_FS_ROOT); OPTIONS.addOption(EXECUTE_JDOQL); OPTIONS.addOption(UPDATE_LOCATION); + OPTIONS.addOption(LIST_EXT_TBL_LOCS); + OPTIONS.addOption(DIFF_EXT_TBL_LOCS); OPTIONS.addOption(DRY_RUN); OPTIONS.addOption(SERDE_PROP_KEY); OPTIONS.addOption(TABLE_PROP_KEY); @@ -102,6 +123,8 @@ private boolean listFSRoot; private String jdoqlQuery; private String[] updateLocationParams; + private String[] listExtTblLocsParams; + private String[] diffExtTblLocsParams; private boolean dryRun; private String serdePropKey; private String tablePropKey; @@ -137,14 +160,18 @@ listFSRoot = cl.hasOption(LIST_FS_ROOT.getOpt()); jdoqlQuery = cl.getOptionValue(EXECUTE_JDOQL.getOpt()); updateLocationParams = cl.getOptionValues(UPDATE_LOCATION.getOpt()); + listExtTblLocsParams = cl.getOptionValues(LIST_EXT_TBL_LOCS.getOpt()); + diffExtTblLocsParams = cl.getOptionValues(DIFF_EXT_TBL_LOCS.getOpt()); dryRun = cl.hasOption(DRY_RUN.getOpt()); serdePropKey = cl.getOptionValue(SERDE_PROP_KEY.getOpt()); tablePropKey = cl.getOptionValue(TABLE_PROP_KEY.getOpt()); help = cl.hasOption(HELP.getOpt()); - int commandCount = (isListFSRoot() ? 1 : 0) + (isExecuteJDOQL() ? 1 : 0) + (isUpdateLocation() ? 1 : 0); + int commandCount = (isListFSRoot() ? 1 : 0) + (isExecuteJDOQL() ? 1 : 0) + (isUpdateLocation() ? 1 : 0) + + (isListExtTblLocs() ? 1 : 0) + (isDiffExtTblLocs() ?
1 : 0); if (commandCount != 1) { - throw new IllegalArgumentException("exectly one of -listFSRoot, -executeJDOQL, -updateLocation must be set"); + throw new IllegalArgumentException("exactly one of -listFSRoot, -executeJDOQL, -updateLocation, " + + "-listExtTblLocs, -diffExtTblLocs must be set"); } if (updateLocationParams != null && updateLocationParams.length != 2) { @@ -152,6 +179,16 @@ class HiveMetaToolCommandLine { updateLocationParams.length + " arguments"); } + if (listExtTblLocsParams != null && listExtTblLocsParams.length != 2) { + throw new IllegalArgumentException("HiveMetaTool:listExtTblLocs takes in 2 arguments but was passed " + + listExtTblLocsParams.length + " arguments"); + } + + if (diffExtTblLocsParams != null && diffExtTblLocsParams.length != 3) { + throw new IllegalArgumentException("HiveMetaTool:diffExtTblLocs takes in 3 arguments but was passed " + + diffExtTblLocsParams.length + " arguments"); + } + if ((dryRun || serdePropKey != null || tablePropKey != null) && !isUpdateLocation()) { throw new IllegalArgumentException("-dryRun, -serdePropKey, -tablePropKey may be used only for the " + "-updateLocation command"); @@ -176,6 +213,8 @@ class HiveMetaToolCommandLine { "\tlistFSRoot : " + listFSRoot + "\n" + "\tjdoqlQuery : " + jdoqlQuery + "\n" + "\tupdateLocation: " + Arrays.toString(updateLocationParams) + "\n" + + "\tlistExtTblLocs: " + Arrays.toString(listExtTblLocsParams) + "\n" + + "\tdiffExtTblLocs: " + Arrays.toString(diffExtTblLocsParams) + "\n" + "\tdryRun : " + dryRun + "\n" + "\tserdePropKey : " + serdePropKey + "\n" + "\ttablePropKey : " + tablePropKey); @@ -197,10 +236,26 @@ class HiveMetaToolCommandLine { return updateLocationParams != null; } + boolean isListExtTblLocs() { + return listExtTblLocsParams != null; + } + + boolean isDiffExtTblLocs() { + return diffExtTblLocsParams != null; + } + String[] getUpddateLocationParams() { return updateLocationParams; } + String[] getListExtTblLocsParams() { + return listExtTblLocsParams; + } + + String[] getDiffExtTblLocsParams() { + return diffExtTblLocsParams; + } + boolean isDryRun() { return dryRun; } diff --git a/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/tools/metatool/MetaToolTaskDiffExtTblLocs.java b/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/tools/metatool/MetaToolTaskDiffExtTblLocs.java new file mode 100644 index 0000000..90b676d --- /dev/null +++ b/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/tools/metatool/MetaToolTaskDiffExtTblLocs.java @@ -0,0 +1,161 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.hadoop.hive.metastore.tools.metatool; + +import org.codehaus.jettison.json.JSONArray; +import org.codehaus.jettison.json.JSONException; +import org.codehaus.jettison.json.JSONObject; +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; +import java.io.PrintWriter; +import java.nio.file.Files; +import java.nio.file.Paths; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Set; + +public class MetaToolTaskDiffExtTblLocs extends MetaToolTask { + @Override + void execute() { + String[] args = getCl().getDiffExtTblLocsParams(); + try { + File file1 = new File(args[0]); + File file2 = new File(args[1]); + String outputDir = args[2]; + String outFileName = "diff_" + System.currentTimeMillis(); + System.out.println("Writing diff to " + outFileName); + if (!file1.exists()) { + System.out.println("Input " + args[0] + " does not exist."); + return; + } + if (!file2.exists()) { + System.out.println("Input " + args[1] + " does not exist."); + return; + } + JSONObject jsonObject = getDiffJson(file1, file2); + FileWriter fw = new FileWriter(outputDir + "/" + outFileName); + PrintWriter pw = new PrintWriter(fw); + pw.println(jsonObject.toString(4).replace("\\", "")); + pw.close(); + } catch (Exception e) { + System.out.println("Generating diff failed: \n" + e.getMessage()); + } + } + + private JSONObject getDiffJson(File file1, File file2) throws IOException, JSONException { + JSONObject inJson1 = new JSONObject(new String(Files.readAllBytes(Paths.get(file1.getAbsolutePath())))); + JSONObject inJson2 = new JSONObject(new String(Files.readAllBytes(Paths.get(file2.getAbsolutePath())))); + Map<String, HashSet<String>> modifiedLocations = new HashMap<>(); + Set<String> keySet1 = getKeySet(inJson1); + Set<String> keySet2 = getKeySet(inJson2); + Set<String> uniqueLocationsFile1 = getSetDifference(keySet1, keySet2); + Set<String> uniqueLocationsFile2 = getSetDifference(keySet2, keySet1); + for (String loc : keySet1) { + if (!uniqueLocationsFile1.contains(loc)) { + //common key, we need to compare the values + JSONArray valArr1 = inJson1.getJSONArray(loc); + JSONArray valArr2 = inJson2.getJSONArray(loc); + for (int i = 0; i < valArr1.length(); i++) { + String val1 = valArr1.getString(i); + boolean absentFromSecondKey = true; + for (int j = 0; j < valArr2.length(); j++) { + String val2 = valArr2.getString(j); + if (val1.equalsIgnoreCase(val2)) { + absentFromSecondKey = false; + break; + } + } + if (absentFromSecondKey) { + if (modifiedLocations.containsKey(loc)) { + modifiedLocations.get(loc).add(asDeleted(val1)); + } else { + modifiedLocations.put(loc, new HashSet<>()); + modifiedLocations.get(loc).add(asDeleted(val1)); + } + } + } + for (int i = 0; i < valArr2.length(); i++) { + String val2 = valArr2.getString(i); + boolean absentFromFirstKey = true; + for (int j = 0; j < valArr1.length(); j++) { + String val1 = valArr1.getString(j); + if (val1.equalsIgnoreCase(val2)) { + absentFromFirstKey = false; + break; + } + } + if (absentFromFirstKey) { + if (modifiedLocations.containsKey(loc)) { + modifiedLocations.get(loc).add(asAdded(val2)); + } else { + modifiedLocations.put(loc, new HashSet<>()); + modifiedLocations.get(loc).add(asAdded(val2)); + } + } + } + } + } + JSONObject jsonObject = new JSONObject(); + if(!uniqueLocationsFile1.isEmpty() || !uniqueLocationsFile2.isEmpty()) { + jsonObject.put("Locations only in
" + file1.getName(), uniqueLocationsFile1); + jsonObject.put("Locations only in " + file2.getName(), uniqueLocationsFile2); + } + for(String commonLoc : modifiedLocations.keySet()) { + List<String> modifiedEntries = new ArrayList<>(); + for (String entry : modifiedLocations.get(commonLoc)) { + modifiedEntries.add(entry); + } + Collections.sort(modifiedEntries); + jsonObject.put(commonLoc, modifiedEntries); + } + return jsonObject; + } + + private Set<String> getKeySet(JSONObject jsonObject) { + Iterator<String> keyIter = jsonObject.keys(); + Set<String> keySet = new HashSet(); + while (keyIter.hasNext()) { + keySet.add(keyIter.next()); + } + return keySet; + } + + private Set<String> getSetDifference(Set<String> keySet1, Set<String> keySet2) { + Set<String> diffSet = new HashSet(); + for(String elem : keySet1) { + if(!keySet2.contains(elem)) { + diffSet.add(elem); + } + } + return diffSet; + } + + private String asDeleted(String str) { + return "- " + str; + } + + private String asAdded(String str) { + return "+ " + str; + } +} \ No newline at end of file diff --git a/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/tools/metatool/MetaToolTaskListExtTblLocs.java b/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/tools/metatool/MetaToolTaskListExtTblLocs.java new file mode 100644 index 0000000..f9d34ee --- /dev/null +++ b/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/tools/metatool/MetaToolTaskListExtTblLocs.java @@ -0,0 +1,668 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.hadoop.hive.metastore.tools.metatool; + +import com.google.common.annotations.VisibleForTesting; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.ContentSummary; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.metastore.ObjectStore; +import org.apache.hadoop.hive.metastore.TableType; +import org.apache.hadoop.hive.metastore.Warehouse; +import org.apache.hadoop.hive.metastore.api.Table; +import org.apache.hadoop.hive.metastore.conf.MetastoreConf; +import org.apache.hadoop.hive.metastore.utils.MetaStoreUtils; +import org.apache.thrift.TException; +import org.codehaus.jettison.json.JSONException; +import org.codehaus.jettison.json.JSONArray; +import org.codehaus.jettison.json.JSONObject; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.FileNotFoundException; +import java.io.FileWriter; +import java.io.IOException; +import java.io.PrintWriter; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashSet; +import java.util.HashMap; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Queue; +import java.util.Set; +import java.util.TreeSet; + +public class MetaToolTaskListExtTblLocs extends MetaToolTask { + private static final Logger LOG = LoggerFactory.getLogger(MetaToolTaskListExtTblLocs.class); + private static Configuration conf; + private final Map<String, HashSet<String>> coverageList = new HashMap<>(); //maps each output-location to the set of input-locations covered by it + private final Map<String, DataLocation> inputLocations = new HashMap<>(); //maps each input-location to a DataLocation object which specifies its properties + + @Override + void execute() { + String[] loc = getCl().getListExtTblLocsParams(); + try { + generateExternalTableInfo(loc[0], loc[1]); + } catch (IOException | TException | JSONException e) { + System.out.println("Generating external table locations failed: \n" + e.getMessage()); + } + } + + private void generateExternalTableInfo(String dbPattern, String outputDir) throws TException, IOException, + JSONException { + ObjectStore objectStore = getObjectStore(); + conf = msConf != null ?
msConf : objectStore.getConf(); + Warehouse wh = new Warehouse(conf); + String defaultCatalog = MetaStoreUtils.getDefaultCatalog(conf); + List<String> databases = objectStore.getDatabases(defaultCatalog, dbPattern); + System.out.println("Number of databases found for given pattern: " + databases.size()); + //maintain the set of leaves of the tree as a sorted set + Set<String> leafLocations = new TreeSet<>(); + for (String db : databases) { + List<String> tables = objectStore.getAllTables(defaultCatalog, db); + Path defaultDbExtPath = wh.getDefaultExternalDatabasePath(db); + String defaultDbExtLocation = defaultDbExtPath.toString(); + boolean isDefaultPathEmpty = true; + for(String tblName : tables) { + Table t = objectStore.getTable(defaultCatalog, db, tblName); + if(TableType.EXTERNAL_TABLE.name().equalsIgnoreCase(t.getTableType())) { + String tblLocation = t.getSd().getLocation(); + Path tblPath = new Path(tblLocation); + if(isPathWithinSubtree(tblPath, defaultDbExtPath)) { + if(isDefaultPathEmpty) { + isDefaultPathEmpty = false; + //default paths should always be included, so we add them as special leaves to the tree + addDefaultPath(defaultDbExtLocation, db); + leafLocations.add(defaultDbExtLocation); + } + HashSet<String> coveredByDefault = coverageList.get(defaultDbExtLocation); + coveredByDefault.add(tblLocation); + } else if (!isCovered(leafLocations, tblPath)) { + leafLocations.add(tblLocation); + } + DataLocation dataLocation = new DataLocation(db, tblName, 0, 0, + null); + inputLocations.put(tblLocation, dataLocation); + dataLocation.setSizeExtTblData(getDataSize(tblPath, conf)); + //retrieving partition locations outside table-location + Map<String, String> partitionLocations = objectStore.getPartitionLocations(defaultCatalog, db, tblName, + tblLocation, -1); + dataLocation.setTotalPartitions(partitionLocations.size()); + for (String partitionName : partitionLocations.keySet()) { + String partLocation = partitionLocations.get(partitionName); + //null value means partition is in table location, we do not add it to input in this case. + if(partLocation == null) { + dataLocation.incrementNumPartsInTblLoc(); + } + else { + partLocation = partLocation + Path.SEPARATOR + + Warehouse.makePartName(Warehouse.makeSpecFromName(partitionName), false); + Path partPath = new Path(partLocation); + long partDataSize = getDataSize(partPath, conf); + if (isPathWithinSubtree(partPath, defaultDbExtPath)) { + if (isDefaultPathEmpty) { + isDefaultPathEmpty = false; + addDefaultPath(defaultDbExtLocation, db); + leafLocations.add(defaultDbExtLocation); + } + if (isPathWithinSubtree(partPath, tblPath)) { + //even in non-null case, handle the corner case where location is set to table-location + //In this case, partition would be covered by table location itself, so we need not add to input + dataLocation.incrementNumPartsInTblLoc(); + } else { + DataLocation partObj = new DataLocation(db, tblName, 0, 0, partitionName); + partObj.setSizeExtTblData(partDataSize); + inputLocations.put(partLocation, partObj); + coverageList.get(defaultDbExtLocation).add(partLocation); + } + } else { + if (isPathWithinSubtree(partPath, tblPath)) { + dataLocation.incrementNumPartsInTblLoc(); + } else { + //only in this case, partition location is neither inside table nor in default location. + //So we add it to the graph as a separate leaf. 
+ DataLocation partObj = new DataLocation(db, tblName, 0, 0, partitionName); + partObj.setSizeExtTblData(partDataSize); + inputLocations.put(partLocation, partObj); + if(!isCovered(leafLocations, partPath)) { + leafLocations.add(partLocation); + } + } + } + } + } + } + } + } + if(!leafLocations.isEmpty()) { + removeNestedStructure(leafLocations); + createOutputList(leafLocations, outputDir, dbPattern); + } + else { + System.out.println("No external tables found to process."); + } + } + + private void addDefaultPath(String defaultDbExtLocation, String dbName) { + coverageList.put(defaultDbExtLocation, new HashSet<>()); + DataLocation defaultDatalocation = new DataLocation(dbName, null, 0, 0, null); + //mark default leaves to always be included in output-list + defaultDatalocation.setIncludeByDefault(true); + inputLocations.put(defaultDbExtLocation, defaultDatalocation); + } + + private long getDataSize(Path location, Configuration conf) throws IOException { + if(location == null) { + return 0; + } + if(MetastoreConf.getBoolVar(conf, MetastoreConf.ConfVars.HIVE_IN_TEST)) { + return testDatasizes == null ? 0 : testDatasizes.containsKey(location.toString()) ? + testDatasizes.get(location.toString()) : 0; + } + FileSystem fs = location.getFileSystem(conf); + if (fs != null && fs.getUri().getScheme().equals("hdfs")) { + try { + ContentSummary cs = fs.getContentSummary(location); + return cs.getLength(); + } catch (FileNotFoundException e) { + //no data yet in data location but we proceed since data may be added later. + } + } + return 0; + } + + private boolean isPathWithinSubtree(Path path, Path subtree) { + int subtreeDepth = subtree.depth(); + while(path != null){ + if (subtreeDepth > path.depth()) { + return false; + } + if(subtree.equals(path)){ + return true; + } + path = path.getParent(); + } + return false; + } + + + /* + * Method to determine if an existing location covers the given location and record the coverage in output. + */ + private boolean isCovered(Set<String> locations, Path path) { + Path originalPath = new Path(path.toString()); + while(path != null){ + if(locations.contains(path.toString())){ + addCoverage(path, originalPath, true); + return true; + } + path = path.getParent(); + } + return false; + } + + /* + * Method to cover a child node using a parent. + * Removes the child and marks all nodes covered by the child as being covered by the parent. + */ + private void addCoverage(Path parentPath, Path childPath, boolean addChild) { + String childLoc = childPath.toString(); + String parentLoc = parentPath.toString(); + //If the path to be covered should be included by default, then we do not cover it. + //This is because default paths should be individually listed, not covered under some parent. + if(inputLocations.containsKey(childLoc) && inputLocations.get(childLoc).shouldIncludeByDefault()) { + return; + } + HashSet<String> pathsUnderChild = coverageList.get(childLoc); + coverageList.remove(childLoc); + if(coverageList.get(parentLoc) == null) { + coverageList.put(parentLoc, new HashSet<>()); + } + HashSet pathsUnderParent = coverageList.get(parentLoc); + if(addChild) { + pathsUnderParent.add(childPath.toString()); + } + if(pathsUnderChild != null) { + pathsUnderParent.addAll(pathsUnderChild); + } + } + + /* + * Transforms a collection so that no element is an ancestor of another. 
+ */ + private void removeNestedStructure(Set<String> locations) { + List<String> locationList = new ArrayList<>(); + locationList.addAll(locations); + for(int i = 0; i < locationList.size(); i++) { + String currLoc = locationList.get(i); + Path currPath = new Path(currLoc); + for(int j = i + 1; j < locationList.size(); j++) { + String nextLoc = locationList.get(j); + Path nextPath = new Path (nextLoc); + if(isPathWithinSubtree(nextPath, currPath)) { + addCoverage(currPath, nextPath, true); + locations.remove(nextLoc); + i = j; + } + else { + i = j - 1; + break; + } + } + } + } + + /* + * Method to write the output to the given location. + * We construct a tree out of external table - locations and use it to determine suitable directories covering all locations. + */ + private void createOutputList(Set<String> locations, String outputDir, String dbPattern) throws IOException, JSONException { + ExternalTableGraphNode rootNode = constructTree(locations); + //Traverse through the tree in breadth-first manner and decide which nodes to include. + //For every node, either cover all leaves in its subtree using itself + // or delegate this duty to its child nodes. + Queue<ExternalTableGraphNode> queue = new LinkedList<>(); + queue.add(rootNode); + while(!queue.isEmpty()){ + ExternalTableGraphNode current = queue.remove(); + if(current.isLeaf()) { + // in this case, the leaf needs to be added to the solution, i.e. marked as being covered. + // This was done during graph construction, so we continue. + continue; + } + int nonTrivialCoverage = 0; + List<ExternalTableGraphNode> childNodes = current.getChildNodes(); + boolean processChildrenByDefault = false; + for(ExternalTableGraphNode child : childNodes) { + if (child.getNumLeavesCovered() > 1) { + nonTrivialCoverage += child.getNumLeavesCovered(); + } + if (child.shouldIncludeByDefault()) { + processChildrenByDefault = true; + break; + } + } + boolean addCurrToSolution = false; + if(!processChildrenByDefault) { + addCurrToSolution = true; + if (!current.shouldIncludeByDefault()) { + //ensure that we do not have extra data in the current node for it to be included. + long currDataSize = getDataSize(new Path(current.getLocation()), conf); + int numLeavesCovered = current.getNumLeavesCovered(); + //only add current node if it doesn't have extra data and non-trivial coverage is less than half. + //Also we do not add current node if there is just a single path(numLeavesCovered = 1); in this case we proceed to the leaf. 
addCurrToSolution &= currDataSize == current.getChildDataSizes() && + ((nonTrivialCoverage < (numLeavesCovered + 1) / 2) && numLeavesCovered != 1); + } + } + if(processChildrenByDefault) { + queue.addAll(childNodes); + } else if (addCurrToSolution) { + addToSolution(current); + } else { + queue.addAll(childNodes); + } + } + String outFileName = "externalTableLocations_" + dbPattern + "_" + System.currentTimeMillis() + ".txt"; + System.out.println("Writing output to " + outFileName); + FileWriter fw = new FileWriter(outputDir + "/" + outFileName); + PrintWriter pw = new PrintWriter(fw); + JSONObject jsonObject = new JSONObject(); + for(String outputLocation : coverageList.keySet()) { + HashSet<String> coveredLocations = coverageList.get(outputLocation); + JSONArray outputEntities = listOutputEntities(coveredLocations); + jsonObject.put(outputLocation, outputEntities); + } + String result = jsonObject.toString(4).replace("\\",""); + pw.println(result); + pw.close(); + } + + /* + * Returns a comma-separated list of entities (tables or partition names) covered by a location. + * Table-name followed by "*" indicates that all partitions are inside table location. + * Otherwise, we record the number of partitions covered by table location. + */ + private JSONArray listOutputEntities(HashSet<String> locations) { + List<String> listEntities = new ArrayList<>(); + for(String loc : locations) { + DataLocation data = inputLocations.get(loc); + String tblName = data.getTblName(); + if(tblName == null) { + continue; + } + String out = data.getDbName() + "." + tblName; + String partName = data.getPartName(); + if (partName == null) { + int numPartInTblLoc = data.getNumPartitionsInTblLoc(); + int totPartitions = data.getTotalPartitions(); + if (totPartitions > 0 && numPartInTblLoc == totPartitions) { + out = out + ".*"; + } + else if (totPartitions > 0) { + out = out + " p(" + numPartInTblLoc + "/" + totPartitions + ")"; + } + } + else { + out = out + "." + partName; + } + listEntities.add(out); + } + Collections.sort(listEntities); + return new JSONArray(listEntities); + } + + private ExternalTableGraphNode constructTree(Set<String> locations) { + ExternalTableGraphNode rootNode = null; + Map<String, ExternalTableGraphNode> locationGraph = new HashMap<>(); + // Every location is represented by a leaf in the tree. + // We traverse through the input locations and construct the tree. + for (String leaf : locations) { + ExternalTableGraphNode currNode = new ExternalTableGraphNode(leaf, new ArrayList<>(), true, 0); + if(inputLocations.containsKey(leaf)) { + if(inputLocations.get(leaf).shouldIncludeByDefault()) { + currNode.setIncludeByDefault(true); + } + currNode.setDataSize(inputLocations.get(leaf).getSizeExtTblData()); + } + locationGraph.put(leaf, currNode); + //initialize coverage-lists of leaves + if (coverageList.get(leaf) == null) { + coverageList.put(leaf, new HashSet<>()); + } + //mark the leaf as being covered by itself + HashSet currCoverage = coverageList.get(leaf); + currCoverage.add(leaf); + //set the number of leaves covered. Nested locations could have been covered earlier during preprocessing, + //so we set it to the size of its coverage set.
+ currNode.setNumLeavesCovered(currCoverage.size()); + Path parent = new Path(leaf).getParent(); + ExternalTableGraphNode parNode; + //traverse upward to the root in order to construct the graph + while (parent != null) { + String parentLoc = parent.toString(); + if (!locationGraph.containsKey(parentLoc)) { + //if parent doesn't exist in graph then create it + parNode = new ExternalTableGraphNode(parentLoc, new ArrayList<>(), false, 0); + locationGraph.put(parentLoc, parNode); + } + else { + parNode = locationGraph.get(parentLoc); + parNode.setIsLeaf(false); + } + if(currNode.getParent() == null) { + parNode.addChild(currNode); + currNode.setParent(parNode); + } + else { + break; + } + currNode = parNode; + parent = parent.getParent(); + } + if (parent == null && rootNode == null) { + rootNode = currNode; + rootNode.setParent(rootNode); + } + } + rootNode.updateNumLeavesCovered(); + rootNode.updateIncludeByDefault(); + rootNode.updateDataSize(); + return rootNode; + } + + private void addToSolution(ExternalTableGraphNode node) { + //since this node is in the solution, all its children should be covered using this node. + if(!node.isLeaf()) { + addCoverageRecursive(node); + } + } + + private void addCoverageRecursive(ExternalTableGraphNode node) { + for(ExternalTableGraphNode child : node.getChildNodes()) { + if(child.isLeaf()) { + addCoverage(new Path(node.getLocation()), new Path(child.getLocation()), true); + } + else { + addCoverageRecursive(child); + addCoverage(new Path(node.getLocation()), new Path(child.getLocation()), false); + } + } + } + + @VisibleForTesting + static Configuration msConf = null; + + @VisibleForTesting + Map<String, Long> testDatasizes = null; + + @VisibleForTesting + public Map<String, HashSet<String>> runTest(Set<String> inputList, Map<String, Long> sizes) { + try { + conf = msConf; + testDatasizes = sizes; + coverageList.clear(); + removeNestedStructure(inputList); + createOutputList(inputList, "test", "test"); + } catch (Exception e) { + LOG.error("MetaToolTask failed on ListExtTblLocs test: ", e); + } + return coverageList; + } + + /* + * Class denoting every external table data location. + * Each location can be either a table location(in this case, partition-name is not set) or + * a partition location which is outside table location. + * If the location is a table location, we store additional data like how many partitions are there in the table + * and how many of them are there in the table loc itself. + */ + private class DataLocation { + private String dbName; + private String tblName; + private int numPartitionsInTblLoc; + private String partName; + private int totalPartitions; + // 'sizeExtTblData' stores the size of useful data in a directory. + // This can be compared with total directory-size to ascertain amount of extra data in it. 
+ long sizeExtTblData; + boolean includeByDefault; + + private DataLocation (String dbName, String tblName, int totalPartitions, int numPartitionsInTblLoc, + String partName) { + this.dbName = dbName; + this.tblName = tblName; + this.totalPartitions = totalPartitions; + this.numPartitionsInTblLoc = numPartitionsInTblLoc; + this.partName = partName; + this.sizeExtTblData = 0; + } + + private void incrementNumPartsInTblLoc() { + this.numPartitionsInTblLoc++; + } + + private String getPartName() { + return this.partName; + } + + private String getDbName() { + return this.dbName; + } + + private String getTblName() { + return this.tblName; + } + + private int getNumPartitionsInTblLoc() { + return this.numPartitionsInTblLoc; + } + + private int getTotalPartitions() { + return this.totalPartitions; + } + + private long getSizeExtTblData() { + return this.sizeExtTblData; + } + + private boolean shouldIncludeByDefault() { + return this.includeByDefault; + } + + private void setTotalPartitions(int totalPartitions) { + this.totalPartitions = totalPartitions; + } + + private void setSizeExtTblData(long sizeExtTblData) { + this.sizeExtTblData = sizeExtTblData; + } + + private void setIncludeByDefault(boolean includeByDefault) { + this.includeByDefault = includeByDefault; + } + } + + private class ExternalTableGraphNode { + private String location; + private List<ExternalTableGraphNode> childNodes; + private ExternalTableGraphNode parent; + private boolean isLeaf; + private boolean includeByDefault; + private int numLeavesCovered; + private long dataSize; + + private ExternalTableGraphNode(String location, List<ExternalTableGraphNode> childNodes, boolean isLeaf, long dataSize) { + this.location = location; + this.childNodes = childNodes; + this.isLeaf = isLeaf; + this.parent = null; + this.includeByDefault = false; + this.dataSize = dataSize; + } + + private void addChild(ExternalTableGraphNode child) { + this.childNodes.add(child); + } + + private List<ExternalTableGraphNode> getChildNodes() { + return this.childNodes; + } + + private boolean isLeaf() { + return this.isLeaf; + } + + public void setIsLeaf(boolean isLeaf) { + this.isLeaf = isLeaf; + } + + private void setNumLeavesCovered(int numLeavesCovered) { + this.numLeavesCovered = numLeavesCovered; + } + + private int getNumLeavesCovered() { + return this.numLeavesCovered; + } + + private String getLocation() { + return this.location; + } + + private void setParent(ExternalTableGraphNode node) { + this.parent = node; + } + + private ExternalTableGraphNode getParent() { + return this.parent; + } + + private boolean shouldIncludeByDefault() { + return this.includeByDefault; + } + + private void setIncludeByDefault(boolean includeByDefault) { + this.includeByDefault = includeByDefault; + } + + private void setDataSize(long dataSize) { + this.dataSize = dataSize; + } + + private long getDataSize() { + return this.dataSize; + } + + private void updateNumLeavesCovered() { + if(this.isLeaf) { + return; + } + this.numLeavesCovered = 0; + for(ExternalTableGraphNode currChild : childNodes) { + currChild.updateNumLeavesCovered(); + this.numLeavesCovered += currChild.getNumLeavesCovered(); + } + } + + /* + * Method to mark all the paths in the subtree rooted at current node which need to be included by default. + * If some leaf has this property, then we mark the path from root to that leaf. 
+ */ + private void updateIncludeByDefault() { + if(this.isLeaf) { + return; + } + for(ExternalTableGraphNode currChild : childNodes) { + currChild.updateIncludeByDefault(); + } + for(ExternalTableGraphNode currChild : childNodes) { + if(currChild.shouldIncludeByDefault()) { + this.includeByDefault = true; + break; + } + } + } + + /* + * Method to update the datasize of subtree rooted at a particular node recursively. + */ + private void updateDataSize() { + if(this.isLeaf) { + return; + } + for(ExternalTableGraphNode currChild : childNodes) { + currChild.updateDataSize(); + } + this.dataSize += this.getChildDataSizes(); + } + + /* + * Method to return sum of data-sizes of child nodes of a particular node + */ + private long getChildDataSizes() { + long sumChildDataSizes = 0; + for(ExternalTableGraphNode currChild : childNodes) { + sumChildDataSizes += currChild.getDataSize(); + } + return sumChildDataSizes; + } + } +} diff --git a/standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/metastore/tools/metatool/TestHiveMetaToolCommandLine.java b/standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/metastore/tools/metatool/TestHiveMetaToolCommandLine.java index 9563bd6..ab090c9 100644 --- a/standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/metastore/tools/metatool/TestHiveMetaToolCommandLine.java +++ b/standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/metastore/tools/metatool/TestHiveMetaToolCommandLine.java @@ -44,6 +44,8 @@ public class TestHiveMetaToolCommandLine { assertNull(cl.getJDOQLQuery()); assertFalse(cl.isUpdateLocation()); assertNull(cl.getUpddateLocationParams()); + assertFalse(cl.isListExtTblLocs()); + assertNull(cl.getListExtTblLocsParams()); assertFalse(cl.isDryRun()); assertNull(cl.getSerdePropKey()); assertNull(cl.getTablePropKey()); @@ -57,6 +59,8 @@ public class TestHiveMetaToolCommandLine { assertEquals("select a from b", cl.getJDOQLQuery()); assertFalse(cl.isUpdateLocation()); assertNull(cl.getUpddateLocationParams()); + assertFalse(cl.isListExtTblLocs()); + assertNull(cl.getListExtTblLocsParams()); assertFalse(cl.isDryRun()); assertNull(cl.getSerdePropKey()); assertNull(cl.getTablePropKey()); @@ -73,6 +77,8 @@ public class TestHiveMetaToolCommandLine { assertTrue(cl.isUpdateLocation()); assertEquals("hdfs://new.loc", cl.getUpddateLocationParams()[0]); assertEquals("hdfs://old.loc", cl.getUpddateLocationParams()[1]); + assertFalse(cl.isListExtTblLocs()); + assertNull(cl.getListExtTblLocsParams()); assertTrue(cl.isDryRun()); assertEquals("abc", cl.getSerdePropKey()); assertEquals("def", cl.getTablePropKey()); @@ -81,7 +87,7 @@ public class TestHiveMetaToolCommandLine { @Test public void testNoTask() throws ParseException { exception.expect(IllegalArgumentException.class); - exception.expectMessage("exectly one of -listFSRoot, -executeJDOQL, -updateLocation must be set"); + exception.expectMessage("exactly one of -listFSRoot, -executeJDOQL, -updateLocation, -listExtTblLocs, -diffExtTblLocs must be set"); new HiveMetaToolCommandLine(new String[] {}); } @@ -89,7 +95,7 @@ public class TestHiveMetaToolCommandLine { @Test public void testMultipleTask() throws ParseException { exception.expect(IllegalArgumentException.class); - exception.expectMessage("exectly one of -listFSRoot, -executeJDOQL, -updateLocation must be set"); + exception.expectMessage("exactly one of -listFSRoot, -executeJDOQL, -updateLocation, -listExtTblLocs, -diffExtTblLocs must be set"); new 
diff --git a/standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/metastore/tools/metatool/TestHiveMetaToolCommandLine.java b/standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/metastore/tools/metatool/TestHiveMetaToolCommandLine.java
index 9563bd6..ab090c9 100644
--- a/standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/metastore/tools/metatool/TestHiveMetaToolCommandLine.java
+++ b/standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/metastore/tools/metatool/TestHiveMetaToolCommandLine.java
@@ -44,6 +44,8 @@ public class TestHiveMetaToolCommandLine {
     assertNull(cl.getJDOQLQuery());
     assertFalse(cl.isUpdateLocation());
     assertNull(cl.getUpddateLocationParams());
+    assertFalse(cl.isListExtTblLocs());
+    assertNull(cl.getListExtTblLocsParams());
     assertFalse(cl.isDryRun());
     assertNull(cl.getSerdePropKey());
     assertNull(cl.getTablePropKey());
@@ -57,6 +59,8 @@ public class TestHiveMetaToolCommandLine {
     assertEquals("select a from b", cl.getJDOQLQuery());
     assertFalse(cl.isUpdateLocation());
     assertNull(cl.getUpddateLocationParams());
+    assertFalse(cl.isListExtTblLocs());
+    assertNull(cl.getListExtTblLocsParams());
     assertFalse(cl.isDryRun());
     assertNull(cl.getSerdePropKey());
     assertNull(cl.getTablePropKey());
@@ -73,6 +77,8 @@ public class TestHiveMetaToolCommandLine {
     assertTrue(cl.isUpdateLocation());
     assertEquals("hdfs://new.loc", cl.getUpddateLocationParams()[0]);
     assertEquals("hdfs://old.loc", cl.getUpddateLocationParams()[1]);
+    assertFalse(cl.isListExtTblLocs());
+    assertNull(cl.getListExtTblLocsParams());
     assertTrue(cl.isDryRun());
     assertEquals("abc", cl.getSerdePropKey());
     assertEquals("def", cl.getTablePropKey());
@@ -81,7 +87,7 @@ public class TestHiveMetaToolCommandLine {
   @Test
   public void testNoTask() throws ParseException {
     exception.expect(IllegalArgumentException.class);
-    exception.expectMessage("exectly one of -listFSRoot, -executeJDOQL, -updateLocation must be set");
+    exception.expectMessage("exactly one of -listFSRoot, -executeJDOQL, -updateLocation, -listExtTblLocs, -diffExtTblLocs must be set");
 
     new HiveMetaToolCommandLine(new String[] {});
   }
@@ -89,7 +95,7 @@ public class TestHiveMetaToolCommandLine {
   @Test
   public void testMultipleTask() throws ParseException {
     exception.expect(IllegalArgumentException.class);
-    exception.expectMessage("exectly one of -listFSRoot, -executeJDOQL, -updateLocation must be set");
+    exception.expectMessage("exactly one of -listFSRoot, -executeJDOQL, -updateLocation, -listExtTblLocs, -diffExtTblLocs must be set");
 
     new HiveMetaToolCommandLine(new String[] {"-listFSRoot", "-executeJDOQL", "select a from b"});
   }
@@ -103,6 +109,28 @@ {
   }
 
   @Test
+  public void testListExtTblLocsOneArgument() throws ParseException {
+    exception.expect(IllegalArgumentException.class);
+    exception.expectMessage("HiveMetaTool:listExtTblLocs takes in 2 arguments but was passed 1 arguments");
+
+    new HiveMetaToolCommandLine(new String[] {"-listExtTblLocs", "db1"});
+  }
+
+  @Test
+  public void testDiffExtTblLocsOneArgument() throws ParseException {
+    exception.expect(IllegalArgumentException.class);
+    exception.expectMessage("HiveMetaTool:diffExtTblLocs takes in 3 arguments but was passed 1 arguments");
+    new HiveMetaToolCommandLine(new String[] {"-diffExtTblLocs", "file1"});
+  }
+
+  @Test
+  public void testDiffExtTblLocsTwoArguments() throws ParseException {
+    exception.expect(IllegalArgumentException.class);
+    exception.expectMessage("HiveMetaTool:diffExtTblLocs takes in 3 arguments but was passed 2 arguments");
+    new HiveMetaToolCommandLine(new String[] {"-diffExtTblLocs", "file1", "file2"});
+  }
+
+  @Test
   public void testDryRunNotAllowed() throws ParseException {
     exception.expect(IllegalArgumentException.class);
     exception.expectMessage("-dryRun, -serdePropKey, -tablePropKey may be used only for the -updateLocation command");
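The two new options surface in HiveMetaToolCommandLine with fixed arities: -listExtTblLocs takes two arguments and -diffExtTblLocs takes three. Put together, the intended invocations look roughly like this (a sketch: the argument counts and the database name come from the tests above, while the meaning of the trailing argument, assumed here to be an output directory for the generated report, is not spelled out in this excerpt):

  hive --service metatool -listExtTblLocs db1 /tmp/metatool-out
  hive --service metatool -diffExtTblLocs /tmp/out1/file1 /tmp/out2/file2 /tmp/diff-out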
diff --git a/standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/metastore/tools/metatool/TestMetaToolTaskListExtTblLocs.java b/standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/metastore/tools/metatool/TestMetaToolTaskListExtTblLocs.java
new file mode 100644
index 0000000..4eb3111
--- /dev/null
+++ b/standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/metastore/tools/metatool/TestMetaToolTaskListExtTblLocs.java
@@ -0,0 +1,295 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.metastore.tools.metatool;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hive.metastore.annotation.MetastoreUnitTest;
+import org.apache.hadoop.hive.metastore.conf.MetastoreConf;
+import org.junit.Assert;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+
+import java.util.Set;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.TreeSet;
+
+
+/* Unit tests for MetaToolTaskListExtTblLocs. */
+@Category(MetastoreUnitTest.class)
+public class TestMetaToolTaskListExtTblLocs {
+
+  /*
+   * Test grouping of locations. No extra data assumed.
+   */
+  @Test
+  public void testGroupLocations() {
+    Set<String> inputLocations = new TreeSet<>();
+    Configuration conf = MetastoreConf.newMetastoreConf();
+    MetastoreConf.setBoolVar(conf, MetastoreConf.ConfVars.HIVE_IN_TEST, true);
+    MetaToolTaskListExtTblLocs.msConf = conf;
+    MetaToolTaskListExtTblLocs task = new MetaToolTaskListExtTblLocs();
+
+    //Case 1: Multiple unpartitioned external tables, expected output: 1 location
+    inputLocations.add("/warehouse/customLocation/t1");
+    inputLocations.add("/warehouse/customLocation/t2");
+    inputLocations.add("/warehouse/customLocation/t3");
+    Map<String, HashSet<String>> output = task.runTest(inputLocations, null);
+    Assert.assertEquals(1, output.size());
+    String expectedOutput = "/warehouse/customLocation";
+    Assert.assertTrue(output.containsKey(expectedOutput));
+    HashSet<String> coveredLocs = output.get(expectedOutput);
+    Assert.assertEquals(3, coveredLocs.size());
+    Assert.assertTrue(coveredLocs.contains("/warehouse/customLocation/t1"));
+    Assert.assertTrue(coveredLocs.contains("/warehouse/customLocation/t2"));
+    Assert.assertTrue(coveredLocs.contains("/warehouse/customLocation/t3"));
+
+    //Case 2 : inputs at multiple depths
+    // inputs ../ext/b0       - contains 1 location
+    //        ../ext/p=0      - contains 1 location
+    //        ../ext/b1/b2/b3 - contains 3 locations (p=1, p=2, p=3)
+    // expected output : [../ext/b1/b2/b3 containing 3 elements, b0, p=0]
+    inputLocations.clear();
+    inputLocations.add("/warehouse/customLocation/ext/b0");
+    inputLocations.add("/warehouse/customLocation/ext/p=0");
+    inputLocations.add("/warehouse/customLocation/ext/b1/b2/b3/p=1");
+    inputLocations.add("/warehouse/customLocation/ext/b1/b2/b3/p=2");
+    inputLocations.add("/warehouse/customLocation/ext/b1/b2/b3/p=3");
+    output = task.runTest(inputLocations, null);
+    Assert.assertEquals(3, output.size());
+    String expectedOutput1 = "/warehouse/customLocation/ext/b0";
+    Assert.assertTrue(output.containsKey(expectedOutput1));
+    coveredLocs = output.get(expectedOutput1);
+    Assert.assertEquals(1, coveredLocs.size());
+    Assert.assertTrue(coveredLocs.contains("/warehouse/customLocation/ext/b0"));
+    String expectedOutput2 = "/warehouse/customLocation/ext/p=0";
+    Assert.assertTrue(output.containsKey(expectedOutput2));
+    coveredLocs = output.get(expectedOutput2);
+    Assert.assertEquals(1, coveredLocs.size());
+    Assert.assertTrue(coveredLocs.contains("/warehouse/customLocation/ext/p=0"));
+    String expectedOutput3 = "/warehouse/customLocation/ext/b1/b2/b3";
+    Assert.assertTrue(output.containsKey(expectedOutput3));
+    coveredLocs = output.get(expectedOutput3);
+    Assert.assertEquals(3, coveredLocs.size());
+    Assert.assertTrue(coveredLocs.contains("/warehouse/customLocation/ext/b1/b2/b3/p=1"));
+    Assert.assertTrue(coveredLocs.contains("/warehouse/customLocation/ext/b1/b2/b3/p=2"));
+    Assert.assertTrue(coveredLocs.contains("/warehouse/customLocation/ext/b1/b2/b3/p=3"));
+
+    //Case 3 : root with a lot of leaves
+    // inputs ../ext/   - contains 4 locations
+    //        ../ext/b1 - contains 3 locations
+    // expected output : [../ext covering all locations] since the root (ext) directly holds more than half of the locations
+    inputLocations.clear();
+    inputLocations.add("/warehouse/customLocation/ext/p=0");
+    inputLocations.add("/warehouse/customLocation/ext/p=1");
+    inputLocations.add("/warehouse/customLocation/ext/p=2");
+    inputLocations.add("/warehouse/customLocation/ext/p=3");
+    inputLocations.add("/warehouse/customLocation/ext/b1/p=4");
+    inputLocations.add("/warehouse/customLocation/ext/b1/p=5");
inputLocations.add("/warehouse/customLocation/ext/b1/p=6"); + output = task.runTest(inputLocations, null); + Assert.assertEquals(1, output.size()); + expectedOutput = "/warehouse/customLocation/ext"; + Assert.assertTrue(output.containsKey(expectedOutput)); + coveredLocs = output.get(expectedOutput); + Assert.assertEquals(7, coveredLocs.size()); + Assert.assertTrue(coveredLocs.containsAll(inputLocations)); + + //Case 4 : root with a lot of trivial locations (non leaf) + // inputs ../ext/ - contains 4 trivial locations + // ../ext/b1 - contains 3 locations + // expected output : [../ext covering all locations] since non trivial (grouped) locations under ext is less than half + inputLocations.clear(); + inputLocations.add("/warehouse/customLocation/ext/dir01/dir02/p=0"); + inputLocations.add("/warehouse/customLocation/ext/dir11/dir12/p=1"); + inputLocations.add("/warehouse/customLocation/ext/dir21/dir22/p=2"); + inputLocations.add("/warehouse/customLocation/ext/dir31/dir32/p=3"); + inputLocations.add("/warehouse/customLocation/ext/b1/p=4"); + inputLocations.add("/warehouse/customLocation/ext/b1/p=5"); + inputLocations.add("/warehouse/customLocation/ext/b1/p=6"); + output = task.runTest(inputLocations, null); + Assert.assertEquals(1, output.size()); + expectedOutput = "/warehouse/customLocation/ext"; + Assert.assertTrue(output.containsKey(expectedOutput)); + coveredLocs = output.get(expectedOutput); + Assert.assertEquals(7, coveredLocs.size()); + Assert.assertTrue(coveredLocs.containsAll(inputLocations)); + + //Case 5 : several grouped locations and 1 outlier at root + // inputs ../ext/b0 - contains 4 locations + // ../ext/b1 - contains 3 locations + // expected output : [../ext/b0, ../ext/b1, p=7 ] + inputLocations.clear(); + inputLocations.add("/warehouse/customLocation/ext/b0/p=0"); + inputLocations.add("/warehouse/customLocation/ext/b0/p=1"); + inputLocations.add("/warehouse/customLocation/ext/b0/p=2"); + inputLocations.add("/warehouse/customLocation/ext/b0/p=3"); + inputLocations.add("/warehouse/customLocation/ext/b1/p=4"); + inputLocations.add("/warehouse/customLocation/ext/b1/p=5"); + inputLocations.add("/warehouse/customLocation/ext/b1/p=6"); + inputLocations.add("/warehouse/customLocation/ext/p=7"); + output = task.runTest(inputLocations, null); + Assert.assertEquals(3, output.size()); + expectedOutput1 = "/warehouse/customLocation/ext/b0"; + Assert.assertTrue(output.containsKey(expectedOutput1)); + coveredLocs = output.get(expectedOutput1); + Assert.assertEquals(4, coveredLocs.size()); + Assert.assertTrue(coveredLocs.contains("/warehouse/customLocation/ext/b0/p=0")); + Assert.assertTrue(coveredLocs.contains("/warehouse/customLocation/ext/b0/p=1")); + Assert.assertTrue(coveredLocs.contains("/warehouse/customLocation/ext/b0/p=2")); + expectedOutput2 = "/warehouse/customLocation/ext/b1"; + Assert.assertTrue(output.containsKey(expectedOutput2)); + coveredLocs = output.get(expectedOutput2); + Assert.assertEquals(3, coveredLocs.size()); + Assert.assertTrue(coveredLocs.contains("/warehouse/customLocation/ext/b1/p=4")); + Assert.assertTrue(coveredLocs.contains("/warehouse/customLocation/ext/b1/p=5")); + Assert.assertTrue(coveredLocs.contains("/warehouse/customLocation/ext/b1/p=6")); + expectedOutput3 = "/warehouse/customLocation/ext/p=7"; + Assert.assertTrue(output.containsKey(expectedOutput3)); + coveredLocs = output.get(expectedOutput3); + Assert.assertEquals(1, coveredLocs.size()); + Assert.assertTrue(coveredLocs.contains("/warehouse/customLocation/ext/p=7")); + + //Case 6 : inputs with 
+    //Case 6 : inputs with nested structure
+    // inputs ../ext/b0    - contains 4 locations
+    //        ../ext/b1
+    //        ../ext/b1/b2 - contains 4 locations
+    // expected output : [../ext/b0, ../ext/b1] (no extra location for b2, since it is covered by b1 itself)
+    inputLocations.clear();
+    inputLocations.add("/warehouse/customLocation/ext/b0/p=0");
+    inputLocations.add("/warehouse/customLocation/ext/b0/p=1");
+    inputLocations.add("/warehouse/customLocation/ext/b0/p=2");
+    inputLocations.add("/warehouse/customLocation/ext/b0/p=3");
+    inputLocations.add("/warehouse/customLocation/ext/b1");
+    inputLocations.add("/warehouse/customLocation/ext/b1/b2/p=7");
+    inputLocations.add("/warehouse/customLocation/ext/b1/b2/p=8");
+    inputLocations.add("/warehouse/customLocation/ext/b1/b2/p=9");
+    output = task.runTest(inputLocations, null);
+    Assert.assertEquals(2, output.size());
+    expectedOutput1 = "/warehouse/customLocation/ext/b0";
+    Assert.assertTrue(output.containsKey(expectedOutput1));
+    coveredLocs = output.get(expectedOutput1);
+    Assert.assertEquals(4, coveredLocs.size());
+    Assert.assertTrue(coveredLocs.contains("/warehouse/customLocation/ext/b0/p=0"));
+    Assert.assertTrue(coveredLocs.contains("/warehouse/customLocation/ext/b0/p=1"));
+    Assert.assertTrue(coveredLocs.contains("/warehouse/customLocation/ext/b0/p=2"));
+    Assert.assertTrue(coveredLocs.contains("/warehouse/customLocation/ext/b0/p=3"));
+    expectedOutput2 = "/warehouse/customLocation/ext/b1";
+    Assert.assertTrue(output.containsKey(expectedOutput2));
+    coveredLocs = output.get(expectedOutput2);
+    Assert.assertEquals(4, coveredLocs.size());
+    Assert.assertTrue(coveredLocs.contains("/warehouse/customLocation/ext/b1"));
+    Assert.assertTrue(coveredLocs.contains("/warehouse/customLocation/ext/b1/b2/p=7"));
+    Assert.assertTrue(coveredLocs.contains("/warehouse/customLocation/ext/b1/b2/p=8"));
+    Assert.assertTrue(coveredLocs.contains("/warehouse/customLocation/ext/b1/b2/p=9"));
+  }
+
+  @Test
+  public void testGroupLocationsDummyDataSizes() {
+    Set<String> inputLocations = new TreeSet<>();
+    Configuration conf = MetastoreConf.newMetastoreConf();
+    MetastoreConf.setBoolVar(conf, MetastoreConf.ConfVars.HIVE_IN_TEST, true);
+    MetaToolTaskListExtTblLocs.msConf = conf;
+    MetaToolTaskListExtTblLocs task = new MetaToolTaskListExtTblLocs();
+
+    //Case 1: Multiple unpartitioned external tables, expected output without extra data: 1 location (tested in testGroupLocations#1)
+    // But say there is some data at ../customLocation, then we list all the 3 paths
+    inputLocations.add("/warehouse/customLocation/t1");
+    inputLocations.add("/warehouse/customLocation/t2");
+    inputLocations.add("/warehouse/customLocation/t3");
+    Map<String, Long> dataSizes = new HashMap<>();
+    dataSizes.put("/warehouse/customLocation", Long.valueOf(100)); //Simulate 100 bytes of extra data at customLocation
+    Map<String, HashSet<String>> output = task.runTest(inputLocations, dataSizes);
+    Assert.assertEquals(3, output.size());
+    String expectedOutput1 = "/warehouse/customLocation/t1";
+    Assert.assertTrue(output.containsKey(expectedOutput1));
+    HashSet<String> coveredLocs = output.get(expectedOutput1);
+    Assert.assertEquals(1, coveredLocs.size());
+    Assert.assertTrue(coveredLocs.contains("/warehouse/customLocation/t1"));
+    Assert.assertTrue(output.containsKey("/warehouse/customLocation/t2"));
+    Assert.assertTrue(output.containsKey("/warehouse/customLocation/t3"));
+
+    //Case 2 : inputs at multiple depths
+    // inputs ../ext/b0       - contains 1 location
+    //        ../ext/p=0      - contains 1 location
+    //        ../ext/b1/b2/b3 - contains 3 locations (p=1, p=2, p=3)
+    // expected output without extra data : [../ext/b1/b2/b3 containing 3 elements, b0, p=0] (tested in testGroupLocations#2)
+    // expected output with extra data at ../ext/b1/b2/b3 : [p=1, p=2, p=3, b0, p=0]
+    inputLocations.clear();
+    dataSizes.clear();
+    inputLocations.add("/warehouse/customLocation/ext/b0");
+    inputLocations.add("/warehouse/customLocation/ext/p=0");
+    inputLocations.add("/warehouse/customLocation/ext/b1/b2/b3/p=1");
+    inputLocations.add("/warehouse/customLocation/ext/b1/b2/b3/p=2");
+    inputLocations.add("/warehouse/customLocation/ext/b1/b2/b3/p=3");
+    dataSizes.put("/warehouse/customLocation/ext/b1/b2/b3", Long.valueOf(100)); // simulate 100 bytes of extra data at ../b3
+    output = task.runTest(inputLocations, dataSizes);
+    Assert.assertEquals(5, output.size());
+    expectedOutput1 = "/warehouse/customLocation/ext/b0";
+    Assert.assertTrue(output.containsKey(expectedOutput1));
+    coveredLocs = output.get(expectedOutput1);
+    Assert.assertEquals(1, coveredLocs.size());
+    Assert.assertTrue(coveredLocs.contains("/warehouse/customLocation/ext/b0"));
+    String expectedOutput2 = "/warehouse/customLocation/ext/p=0";
+    Assert.assertTrue(output.containsKey(expectedOutput2));
+    coveredLocs = output.get(expectedOutput2);
+    Assert.assertEquals(1, coveredLocs.size());
+    Assert.assertTrue(coveredLocs.contains("/warehouse/customLocation/ext/p=0"));
+    String expectedOutput3 = "/warehouse/customLocation/ext/b1/b2/b3/p=1";
+    Assert.assertTrue(output.containsKey(expectedOutput3));
+    coveredLocs = output.get(expectedOutput3);
+    Assert.assertEquals(1, coveredLocs.size());
+    Assert.assertTrue(coveredLocs.contains("/warehouse/customLocation/ext/b1/b2/b3/p=1"));
+    String expectedOutput4 = "/warehouse/customLocation/ext/b1/b2/b3/p=2";
+    Assert.assertTrue(output.containsKey(expectedOutput4));
+    coveredLocs = output.get(expectedOutput4);
+    Assert.assertEquals(1, coveredLocs.size());
+    Assert.assertTrue(coveredLocs.contains("/warehouse/customLocation/ext/b1/b2/b3/p=2"));
+    String expectedOutput5 = "/warehouse/customLocation/ext/b1/b2/b3/p=3";
+    Assert.assertTrue(output.containsKey(expectedOutput5));
+    coveredLocs = output.get(expectedOutput5);
+    Assert.assertEquals(1, coveredLocs.size());
+    Assert.assertTrue(coveredLocs.contains("/warehouse/customLocation/ext/b1/b2/b3/p=3"));
+
+    //Case 3 : intermediate directory has extra data
+    // inputs ../ext/   - contains 4 locations
+    //        ../ext/b1 - contains 3 locations
+    // expected output without extra data : [../ext covering all locations] (tested in testGroupLocations#3)
+    // We simulate extra data at ../ext/b1. So, the expected output is the list of all locations.
+    inputLocations.clear();
+    dataSizes.clear();
+    inputLocations.add("/warehouse/customLocation/ext/p=0");
+    inputLocations.add("/warehouse/customLocation/ext/p=1");
+    inputLocations.add("/warehouse/customLocation/ext/p=2");
+    inputLocations.add("/warehouse/customLocation/ext/p=3");
+    inputLocations.add("/warehouse/customLocation/ext/b1/p=4");
+    inputLocations.add("/warehouse/customLocation/ext/b1/p=5");
+    inputLocations.add("/warehouse/customLocation/ext/b1/p=6");
+    dataSizes.put("/warehouse/customLocation/ext/b1", Long.valueOf(100)); // simulate 100 bytes of extra data at ../ext/b1
+    dataSizes.put("/warehouse/customLocation/ext", Long.valueOf(100)); // since ext/b1 contains 100 bytes, ../ext also has 100 bytes
+    output = task.runTest(inputLocations, dataSizes);
+    Assert.assertEquals(7, output.size());
+    Assert.assertTrue(output.keySet().containsAll(inputLocations));
+    for (String outLoc : output.keySet()) {
+      Assert.assertTrue(output.get(outLoc).contains(outLoc));
+    }
+  }
+}
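To run just these unit tests locally (standard Maven Surefire filtering; the module path follows the diff headers above):

  cd standalone-metastore/metastore-server
  mvn test -Dtest=TestMetaToolTaskListExtTblLocs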