deniskuzZ commented on code in PR #6020: URL: https://github.com/apache/hive/pull/6020#discussion_r2278783110
########## standalone-metastore/metastore-client/src/main/java/org/apache/hadoop/hive/metastore/utils/TableFetcher.java: ########## @@ -102,21 +104,47 @@ public List<TableName> getTables() throws Exception { List<String> databases = client.getDatabases(catalogName, dbPattern); for (String db : databases) { - Database database = client.getDatabase(catalogName, db); - if (MetaStoreUtils.checkIfDbNeedsToBeSkipped(database)) { - LOG.debug("Skipping table under database: {}", db); - continue; - } - if (MetaStoreUtils.isDbBeingPlannedFailedOver(database)) { - LOG.info("Skipping table that belongs to database {} being failed over.", db); - continue; - } - List<String> tablesNames = client.listTableNamesByFilter(catalogName, db, tableFilter, -1); + List<String> tablesNames = getTableNamesForDatabase(catalogName, db); tablesNames.forEach(tablesName -> candidates.add(TableName.fromString(tablesName, catalogName, db))); } return candidates; } + public List<Table> getTables(int maxBatchSize) throws Exception { + List<Table> candidates = new ArrayList<>(); + + // if tableTypes is empty, then a list with single empty string has to specified to scan no tables. + if (tableTypes.isEmpty()) { + LOG.info("Table fetcher returns empty list as no table types specified"); + return candidates; + } + + List<String> databases = client.getDatabases(catalogName, dbPattern); + + for (String db : databases) { + List<String> tablesNames = getTableNamesForDatabase(catalogName, db); Review Comment: In order to use batching, you need to have the list of tables to fetch - that's OK. However, instead of working with the batches, you load everything into memory. Could you refactor to use TableIterable (i.e., make getTables return Iterable<Table>)? 
########## standalone-metastore/metastore-client/src/main/java/org/apache/hadoop/hive/metastore/utils/TableFetcher.java: ########## @@ -102,21 +104,47 @@ public List<TableName> getTables() throws Exception { List<String> databases = client.getDatabases(catalogName, dbPattern); for (String db : databases) { - Database database = client.getDatabase(catalogName, db); - if (MetaStoreUtils.checkIfDbNeedsToBeSkipped(database)) { - LOG.debug("Skipping table under database: {}", db); - continue; - } - if (MetaStoreUtils.isDbBeingPlannedFailedOver(database)) { - LOG.info("Skipping table that belongs to database {} being failed over.", db); - continue; - } - List<String> tablesNames = client.listTableNamesByFilter(catalogName, db, tableFilter, -1); + List<String> tablesNames = getTableNamesForDatabase(catalogName, db); tablesNames.forEach(tablesName -> candidates.add(TableName.fromString(tablesName, catalogName, db))); } return candidates; } + public List<Table> getTables(int maxBatchSize) throws Exception { + List<Table> candidates = new ArrayList<>(); + + // if tableTypes is empty, then a list with single empty string has to specified to scan no tables. + if (tableTypes.isEmpty()) { + LOG.info("Table fetcher returns empty list as no table types specified"); + return candidates; + } + + List<String> databases = client.getDatabases(catalogName, dbPattern); + + for (String db : databases) { + List<String> tablesNames = getTableNamesForDatabase(catalogName, db); Review Comment: In order to use batching, you need to have the list of tables to fetch - that's OK. However, instead of working with the batches, you load everything into memory. Could you refactor to use TableIterable (i.e., make getTables return Iterable\<Table\>)? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
To unsubscribe, e-mail: gitbox-unsubscr...@hive.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: gitbox-unsubscr...@hive.apache.org For additional commands, e-mail: gitbox-h...@hive.apache.org