Repository: carbondata Updated Branches: refs/heads/master cc4bc8100 -> 001795ce2
[CARBONDATA-2103] Optimize show tables for filtering datamaps Problem: Show tables was slow because two lookups were performed for each table in order to filter out the datamaps. Solution: Add a hive table property which is true for all tables and false for datamaps (like the preAggregate table), and make show tables filter out the datamap tables based on this property. This closes #1980 Project: http://git-wip-us.apache.org/repos/asf/carbondata/repo Commit: http://git-wip-us.apache.org/repos/asf/carbondata/commit/001795ce Tree: http://git-wip-us.apache.org/repos/asf/carbondata/tree/001795ce Diff: http://git-wip-us.apache.org/repos/asf/carbondata/diff/001795ce Branch: refs/heads/master Commit: 001795ce28a3295d899deca96909b063aaf95cf1 Parents: cc4bc81 Author: akashrn5 <[email protected]> Authored: Thu Feb 15 19:00:26 2018 +0530 Committer: manishgupta88 <[email protected]> Committed: Thu Feb 22 18:50:35 2018 +0530 ---------------------------------------------------------------------- .../preaggregate/TestPreAggCreateCommand.scala | 9 +++++ .../testsuite/datamap/TestDataMapCommand.scala | 7 +++- .../CreatePreAggregateTableCommand.scala | 2 +- .../table/CarbonCreateTableCommand.scala | 9 ++++- .../command/table/CarbonShowTablesCommand.scala | 40 ++++---------------- 5 files changed, 29 insertions(+), 38 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/carbondata/blob/001795ce/integration/spark-common-test/src/test/scala/org/apache/carbondata/integration/spark/testsuite/preaggregate/TestPreAggCreateCommand.scala ---------------------------------------------------------------------- diff --git a/integration/spark-common-test/src/test/scala/org/apache/carbondata/integration/spark/testsuite/preaggregate/TestPreAggCreateCommand.scala b/integration/spark-common-test/src/test/scala/org/apache/carbondata/integration/spark/testsuite/preaggregate/TestPreAggCreateCommand.scala index 6988adc..1e59a80 100644 --- 
a/integration/spark-common-test/src/test/scala/org/apache/carbondata/integration/spark/testsuite/preaggregate/TestPreAggCreateCommand.scala +++ b/integration/spark-common-test/src/test/scala/org/apache/carbondata/integration/spark/testsuite/preaggregate/TestPreAggCreateCommand.scala @@ -38,6 +38,7 @@ class TestPreAggCreateCommand extends QueryTest with BeforeAndAfterAll { sql("drop table if exists PreAggMain1") sql("drop table if exists PreAggMain2") sql("drop table if exists maintable") + sql("drop table if exists showTables") sql("create table preaggMain (a string, b string, c string) stored by 'carbondata'") sql("create table preaggMain1 (a string, b string, c string) stored by 'carbondata' tblProperties('DICTIONARY_INCLUDE' = 'a')") sql("create table preaggMain2 (a string, b string, c string) stored by 'carbondata'") @@ -376,6 +377,13 @@ class TestPreAggCreateCommand extends QueryTest with BeforeAndAfterAll { sql("DROP DATAMAP IF EXISTS agg0 ON TABLE maintable") } + test("test show tables filterted with datamaps"){ + sql("create table showTables(name string, age int) stored by 'carbondata'") + sql("create datamap preAgg on table showTables using 'preaggregate' as select sum(age) from showTables") + sql("show tables").show() + assert(!sql("show tables").collect().contains("showTables_preagg")) + } + def getCarbontable(plan: LogicalPlan) : CarbonTable ={ var carbonTable : CarbonTable = null plan.transform { @@ -403,5 +411,6 @@ class TestPreAggCreateCommand extends QueryTest with BeforeAndAfterAll { sql("drop table if exists PreAggMain1") sql("drop table if exists PreAggMain2") sql("drop table if exists maintabletime") + sql("drop table if exists showTables") } } http://git-wip-us.apache.org/repos/asf/carbondata/blob/001795ce/integration/spark-common-test/src/test/scala/org/apache/carbondata/spark/testsuite/datamap/TestDataMapCommand.scala ---------------------------------------------------------------------- diff --git 
a/integration/spark-common-test/src/test/scala/org/apache/carbondata/spark/testsuite/datamap/TestDataMapCommand.scala b/integration/spark-common-test/src/test/scala/org/apache/carbondata/spark/testsuite/datamap/TestDataMapCommand.scala index 146ad62..f403b3e 100644 --- a/integration/spark-common-test/src/test/scala/org/apache/carbondata/spark/testsuite/datamap/TestDataMapCommand.scala +++ b/integration/spark-common-test/src/test/scala/org/apache/carbondata/spark/testsuite/datamap/TestDataMapCommand.scala @@ -130,9 +130,12 @@ class TestDataMapCommand extends QueryTest with BeforeAndAfterAll { true, "datamap_hiveMetaStoreTable_1") + sql("drop datamap datamap_hiveMetaStoreTable_1 on table hiveMetaStoreTable_1") + checkExistence(sql("show datamap on table hiveMetaStoreTable_1"), + false, + "datamap_hiveMetaStoreTable_1") + assert(sql("show datamap on table hiveMetaStoreTable_1").collect().length == 0) sql("drop table hiveMetaStoreTable_1") - - checkExistence(sql("show tables"), false, "datamap_hiveMetaStoreTable_1") } finally { CarbonProperties.getInstance() http://git-wip-us.apache.org/repos/asf/carbondata/blob/001795ce/integration/spark2/src/main/scala/org/apache/spark/sql/execution/command/preaaggregate/CreatePreAggregateTableCommand.scala ---------------------------------------------------------------------- diff --git a/integration/spark2/src/main/scala/org/apache/spark/sql/execution/command/preaaggregate/CreatePreAggregateTableCommand.scala b/integration/spark2/src/main/scala/org/apache/spark/sql/execution/command/preaaggregate/CreatePreAggregateTableCommand.scala index 46d885d..c861b00 100644 --- a/integration/spark2/src/main/scala/org/apache/spark/sql/execution/command/preaaggregate/CreatePreAggregateTableCommand.scala +++ b/integration/spark2/src/main/scala/org/apache/spark/sql/execution/command/preaaggregate/CreatePreAggregateTableCommand.scala @@ -117,7 +117,7 @@ case class CreatePreAggregateTableCommand( CarbonEnv.getTablePath(tableModel.databaseNameOp, 
tableModel.tableName)(sparkSession) } CarbonCreateTableCommand(TableNewProcessor(tableModel), - tableModel.ifNotExistsSet, Some(tablePath)).run(sparkSession) + tableModel.ifNotExistsSet, Some(tablePath), isVisible = false).run(sparkSession) val table = CarbonEnv.getCarbonTable(tableIdentifier)(sparkSession) val tableInfo = table.getTableInfo http://git-wip-us.apache.org/repos/asf/carbondata/blob/001795ce/integration/spark2/src/main/scala/org/apache/spark/sql/execution/command/table/CarbonCreateTableCommand.scala ---------------------------------------------------------------------- diff --git a/integration/spark2/src/main/scala/org/apache/spark/sql/execution/command/table/CarbonCreateTableCommand.scala b/integration/spark2/src/main/scala/org/apache/spark/sql/execution/command/table/CarbonCreateTableCommand.scala index c4030d6..fc7c13a 100644 --- a/integration/spark2/src/main/scala/org/apache/spark/sql/execution/command/table/CarbonCreateTableCommand.scala +++ b/integration/spark2/src/main/scala/org/apache/spark/sql/execution/command/table/CarbonCreateTableCommand.scala @@ -38,7 +38,8 @@ case class CarbonCreateTableCommand( tableInfo: TableInfo, ifNotExistsSet: Boolean = false, tableLocation: Option[String] = None, - createDSTable: Boolean = true) + createDSTable: Boolean = true, + isVisible: Boolean = true) extends MetadataCommand { override def processMetadata(sparkSession: SparkSession): Seq[Row] = { @@ -102,6 +103,9 @@ case class CarbonCreateTableCommand( } else { "" } + // isVisible property is added to hive table properties to differentiate between main + // table and datamaps(like preaggregate). It is false only for datamaps. 
This is added + // to improve the show tables performance when filtering the datamaps from main tables sparkSession.sql( s"""CREATE TABLE $dbName.$tableName |(${ rawSchema }) @@ -110,7 +114,8 @@ case class CarbonCreateTableCommand( | tableName "$tableName", | dbName "$dbName", | tablePath "$tablePath", - | path "$tablePath" + | path "$tablePath", + | isVisible "$isVisible" | $carbonSchemaString) | $partitionString """.stripMargin) http://git-wip-us.apache.org/repos/asf/carbondata/blob/001795ce/integration/spark2/src/main/scala/org/apache/spark/sql/execution/command/table/CarbonShowTablesCommand.scala ---------------------------------------------------------------------- diff --git a/integration/spark2/src/main/scala/org/apache/spark/sql/execution/command/table/CarbonShowTablesCommand.scala b/integration/spark2/src/main/scala/org/apache/spark/sql/execution/command/table/CarbonShowTablesCommand.scala index c2a91d8..e3c4e97 100644 --- a/integration/spark2/src/main/scala/org/apache/spark/sql/execution/command/table/CarbonShowTablesCommand.scala +++ b/integration/spark2/src/main/scala/org/apache/spark/sql/execution/command/table/CarbonShowTablesCommand.scala @@ -39,44 +39,18 @@ private[sql] case class CarbonShowTablesCommand ( databaseName: Option[String], override def processMetadata(sparkSession: SparkSession): Seq[Row] = { // Since we need to return a Seq of rows, we will call getTables directly // instead of calling tables in sparkSession. - // filterDataMaps Method is to Filter the Table. 
val catalog = sparkSession.sessionState.catalog val db = databaseName.getOrElse(catalog.getCurrentDatabase) var tables = tableIdentifierPattern.map(catalog.listTables(db, _)).getOrElse(catalog.listTables(db)) - tables = filterDataMaps(tables, sparkSession) - tables.map { tableIdent => - val isTemp = catalog.isTemporaryTable(tableIdent) - Row(tableIdent.database.getOrElse("default"), tableIdent.table, isTemp) + val externalCatalog = sparkSession.sharedState.externalCatalog + // tables will be filtered for all the dataMaps to show only main tables + tables.collect { + case tableIdent if externalCatalog.getTable(db, tableIdent.table).storage.properties + .getOrElse("isVisible", true).toString.toBoolean => + val isTemp = catalog.isTemporaryTable(tableIdent) + Row(tableIdent.database.getOrElse("default"), tableIdent.table, isTemp) } } - /** - * - * @param tables tableIdnetifers - * @param sparkSession sparksession - * @return Tables after filter datamap tables - */ - private def filterDataMaps(tables: Seq[TableIdentifier], - sparkSession: SparkSession): Seq[TableIdentifier] = { - // Filter carbon Tables then get CarbonTable and getDataMap List and filter the same - // as of now 2 times lookup is happening(filter carbon table ,getDataMapList) - // TODO : add another PR (CARBONDATA-2103) to improve with 1 lookup - val allDatamapTable = tables.filter { table => - CarbonEnv.getInstance(sparkSession).carbonMetastore - .tableExists(table)(sparkSession) - }.map { table => - val ctable = CarbonEnv.getCarbonTable(table.database, table.table)(sparkSession) - ctable.getTableInfo.getDataMapSchemaList.asScala - } - val alldamrelation = allDatamapTable - .flatMap { table => - table.map(eachtable => eachtable.getRelationIdentifier.toString) - } - tables - .filter { table => - !alldamrelation - .contains(table.database.getOrElse("default") + "." + table.identifier) - } - } }
