This is an automated email from the ASF dual-hosted git repository.
jshao pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/gravitino.git
The following commit(s) were added to refs/heads/main by this push:
new 6d05ec4b8 [#4089] fix(hive catalog): the problem of slow acquisition
of hive table list (#4469)
6d05ec4b8 is described below
commit 6d05ec4b8bf278c2dd7a36af35589f07b220fcab
Author: mygrsun <[email protected]>
AuthorDate: Mon Nov 4 21:08:31 2024 +0800
[#4089] fix(hive catalog): the problem of slow acquisition of hive table
list (#4469)
### What changes were proposed in this pull request?
the problem of slow acquisition of hive table list.
Using listTableNamesByFilter to replace the getTableObjectsByName method.
### Why are the changes needed?
I found that list-table will take 300s when a schema has 5000 tables.
Fix: #4089
### Does this PR introduce _any_ user-facing change?
no
### How was this patch tested?
Manual testing
---------
Co-authored-by: ericqin <[email protected]>
Co-authored-by: mchades <[email protected]>
---
.../catalog/hive/HiveCatalogOperations.java | 77 +++++++++++++---------
.../gravitino/catalog/hive/TestHiveTable.java | 35 +++++++++-
.../hive/integration/test/CatalogHiveIT.java | 20 ++++++
docs/apache-hive-catalog.md | 9 ++-
4 files changed, 106 insertions(+), 35 deletions(-)
diff --git
a/catalogs/catalog-hive/src/main/java/org/apache/gravitino/catalog/hive/HiveCatalogOperations.java
b/catalogs/catalog-hive/src/main/java/org/apache/gravitino/catalog/hive/HiveCatalogOperations.java
index 6de028f41..bb7d06f6b 100644
---
a/catalogs/catalog-hive/src/main/java/org/apache/gravitino/catalog/hive/HiveCatalogOperations.java
+++
b/catalogs/catalog-hive/src/main/java/org/apache/gravitino/catalog/hive/HiveCatalogOperations.java
@@ -21,9 +21,7 @@ package org.apache.gravitino.catalog.hive;
import static
org.apache.gravitino.catalog.hive.HiveCatalogPropertiesMeta.LIST_ALL_TABLES;
import static
org.apache.gravitino.catalog.hive.HiveCatalogPropertiesMeta.METASTORE_URIS;
import static
org.apache.gravitino.catalog.hive.HiveCatalogPropertiesMeta.PRINCIPAL;
-import static
org.apache.gravitino.catalog.hive.HiveTable.ICEBERG_TABLE_TYPE_VALUE;
import static org.apache.gravitino.catalog.hive.HiveTable.SUPPORT_TABLE_TYPES;
-import static org.apache.gravitino.catalog.hive.HiveTable.TABLE_TYPE_PROP;
import static
org.apache.gravitino.catalog.hive.HiveTablePropertiesMetadata.COMMENT;
import static
org.apache.gravitino.catalog.hive.HiveTablePropertiesMetadata.TABLE_TYPE;
import static org.apache.gravitino.connector.BaseCatalog.CATALOG_BYPASS_PREFIX;
@@ -93,6 +91,7 @@ import
org.apache.hadoop.hive.metastore.api.InvalidOperationException;
import org.apache.hadoop.hive.metastore.api.NoSuchObjectException;
import org.apache.hadoop.hive.metastore.api.StorageDescriptor;
import org.apache.hadoop.hive.metastore.api.UnknownDBException;
+import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants;
import org.apache.hadoop.security.SecurityUtil;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.thrift.TException;
@@ -116,7 +115,10 @@ public class HiveCatalogOperations implements
CatalogOperations, SupportsSchemas
private ScheduledThreadPoolExecutor checkTgtExecutor;
private String kerberosRealm;
private ProxyPlugin proxyPlugin;
- boolean listAllTables = true;
+ private boolean listAllTables = true;
+ // The maximum number of tables that can be returned by the
listTableNamesByFilter function.
+ // The default value is -1, which means that all tables are returned.
+ private static final short MAX_TABLES = -1;
// Map that maintains the mapping of keys in Gravitino to that in Hive, for
example, users
// will only need to set the configuration 'METASTORE_URL' in Gravitino and
Gravitino will change
@@ -539,23 +541,32 @@ public class HiveCatalogOperations implements
CatalogOperations, SupportsSchemas
// then based on
// those names we can obtain metadata for each individual table and get
the type we needed.
List<String> allTables = clientPool.run(c ->
c.getAllTables(schemaIdent.name()));
- return clientPool.run(
- c ->
- c.getTableObjectsByName(schemaIdent.name(), allTables).stream()
- .filter(
- tb -> {
- boolean isSupportTable =
SUPPORT_TABLE_TYPES.contains(tb.getTableType());
- if (!isSupportTable) {
- return false;
- }
- if (!listAllTables) {
- Map<String, String> parameters = tb.getParameters();
- return isHiveTable(parameters);
- }
- return true;
- })
- .map(tb -> NameIdentifier.of(namespace, tb.getTableName()))
- .toArray(NameIdentifier[]::new));
+ if (!listAllTables) {
+ // The reason for using the listTableNamesByFilter function is that the
+        // getTableObjectsByName function has poor performance. Currently,
we focus on the
+ // Iceberg, Paimon and Hudi table. In the future, if necessary, we
will need to filter out
+ // other tables. In addition, the current return also includes tables
of type VIRTUAL-VIEW.
+ String icebergAndPaimonFilter = getIcebergAndPaimonFilter();
+ List<String> icebergAndPaimonTables =
+ clientPool.run(
+ c ->
+ c.listTableNamesByFilter(
+ schemaIdent.name(), icebergAndPaimonFilter,
MAX_TABLES));
+ allTables.removeAll(icebergAndPaimonTables);
+
+ // filter out the Hudi tables
+ String hudiFilter =
+ String.format(
+ "%sprovider like \"hudi\"",
hive_metastoreConstants.HIVE_FILTER_FIELD_PARAMS);
+ List<String> hudiTables =
+ clientPool.run(
+ c -> c.listTableNamesByFilter(schemaIdent.name(), hudiFilter,
MAX_TABLES));
+ removeHudiTables(allTables, hudiTables);
+ }
+ return allTables.stream()
+ .map(tbName -> NameIdentifier.of(namespace, tbName))
+ .toArray(NameIdentifier[]::new);
+
} catch (UnknownDBException e) {
throw new NoSuchSchemaException(
"Schema (database) does not exist %s in Hive Metastore", namespace);
@@ -569,20 +580,24 @@ public class HiveCatalogOperations implements
CatalogOperations, SupportsSchemas
}
}
- boolean isHiveTable(Map<String, String> tableParameters) {
- if (isIcebergTable(tableParameters)) return false;
- return true;
+ private static String getIcebergAndPaimonFilter() {
+ String icebergFilter =
+ String.format(
+ "%stable_type like \"ICEBERG\"",
hive_metastoreConstants.HIVE_FILTER_FIELD_PARAMS);
+ String paimonFilter =
+ String.format(
+ "%stable_type like \"PAIMON\"",
hive_metastoreConstants.HIVE_FILTER_FIELD_PARAMS);
+ return String.format("%s or %s", icebergFilter, paimonFilter);
}
- boolean isIcebergTable(Map<String, String> tableParameters) {
- if (tableParameters != null) {
- boolean isIcebergTable =
-
ICEBERG_TABLE_TYPE_VALUE.equalsIgnoreCase(tableParameters.get(TABLE_TYPE_PROP));
- if (isIcebergTable) {
- return true;
- }
+ private void removeHudiTables(List<String> allTables, List<String>
hudiTables) {
+ for (String hudiTable : hudiTables) {
+ allTables.removeIf(
+ t ->
+ t.equals(hudiTable)
+ || t.startsWith(hudiTable + "_ro")
+ || t.startsWith(hudiTable + "_rt"));
}
- return false;
}
/**
diff --git
a/catalogs/catalog-hive/src/test/java/org/apache/gravitino/catalog/hive/TestHiveTable.java
b/catalogs/catalog-hive/src/test/java/org/apache/gravitino/catalog/hive/TestHiveTable.java
index 9b9e0c8a8..2823bf276 100644
---
a/catalogs/catalog-hive/src/test/java/org/apache/gravitino/catalog/hive/TestHiveTable.java
+++
b/catalogs/catalog-hive/src/test/java/org/apache/gravitino/catalog/hive/TestHiveTable.java
@@ -25,6 +25,7 @@ import static
org.apache.gravitino.rel.expressions.transforms.Transforms.day;
import static
org.apache.gravitino.rel.expressions.transforms.Transforms.identity;
import static org.apache.hadoop.hive.metastore.TableType.EXTERNAL_TABLE;
+import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Maps;
import java.time.Instant;
import java.util.Arrays;
@@ -70,14 +71,14 @@ public class TestHiveTable extends MiniHiveMetastoreService
{
NameIdentifier.of(META_LAKE_NAME, HIVE_CATALOG_NAME, HIVE_SCHEMA_NAME);
@BeforeAll
- private static void setup() {
+ public static void setup() {
hiveCatalog = initHiveCatalog();
hiveCatalogOperations = (HiveCatalogOperations) hiveCatalog.ops();
hiveSchema = initHiveSchema();
}
@AfterEach
- private void resetSchema() {
+ public void resetSchema() {
hiveCatalogOperations.dropSchema(schemaIdent, true);
hiveSchema = initHiveSchema();
}
@@ -343,7 +344,35 @@ public class TestHiveTable extends
MiniHiveMetastoreService {
}
@Test
- public void testListTableException() {
+ public void testListTable() {
+ // mock iceberg table and hudi table
+ NameIdentifier icebergTableIdent =
+ NameIdentifier.of(META_LAKE_NAME, hiveCatalog.name(),
hiveSchema.name(), "iceberg_table");
+ NameIdentifier hudiTableIdent =
+ NameIdentifier.of(META_LAKE_NAME, hiveCatalog.name(),
hiveSchema.name(), "hudi_table");
+
+ hiveCatalogOperations.createTable(
+ icebergTableIdent,
+ new Column[] {
+
HiveColumn.builder().withName("col_1").withType(Types.ByteType.get()).build()
+ },
+ HIVE_COMMENT,
+ ImmutableMap.of("table_type", "ICEBERG"));
+ hiveCatalogOperations.createTable(
+ hudiTableIdent,
+ new Column[] {
+
HiveColumn.builder().withName("col_1").withType(Types.ByteType.get()).build()
+ },
+ HIVE_COMMENT,
+ ImmutableMap.of("provider", "hudi"));
+
+ // test list table
+ NameIdentifier[] tableIdents =
+ hiveCatalogOperations.listTables(
+ Namespace.of("metalake", hiveCatalog.name(), hiveSchema.name()));
+ Assertions.assertEquals(0, tableIdents.length);
+
+ // test exception
Namespace tableNs = Namespace.of("metalake", hiveCatalog.name(),
"not_exist_db");
TableCatalog tableCatalog = hiveCatalogOperations;
Throwable exception =
diff --git
a/catalogs/catalog-hive/src/test/java/org/apache/gravitino/catalog/hive/integration/test/CatalogHiveIT.java
b/catalogs/catalog-hive/src/test/java/org/apache/gravitino/catalog/hive/integration/test/CatalogHiveIT.java
index 903f3fe8a..16ca3b57f 100644
---
a/catalogs/catalog-hive/src/test/java/org/apache/gravitino/catalog/hive/integration/test/CatalogHiveIT.java
+++
b/catalogs/catalog-hive/src/test/java/org/apache/gravitino/catalog/hive/integration/test/CatalogHiveIT.java
@@ -52,6 +52,7 @@ import org.apache.gravitino.Catalog;
import org.apache.gravitino.CatalogChange;
import org.apache.gravitino.MetalakeChange;
import org.apache.gravitino.NameIdentifier;
+import org.apache.gravitino.Namespace;
import org.apache.gravitino.Schema;
import org.apache.gravitino.SchemaChange;
import org.apache.gravitino.SupportsSchemas;
@@ -613,6 +614,25 @@ public class CatalogHiveIT extends BaseIT {
Assertions.assertTrue(exception.getMessage().contains("cannot be set"));
}
+ @Test
+ public void testListTables() {
+ // mock iceberg, paimon, and hudi tables
+ NameIdentifier icebergTable = NameIdentifier.of(schemaName,
"iceberg_table");
+ NameIdentifier paimonTable = NameIdentifier.of(schemaName, "paimon_table");
+ NameIdentifier hudiTable = NameIdentifier.of(schemaName, "hudi_table");
+ catalog
+ .asTableCatalog()
+ .createTable(icebergTable, createColumns(), null,
ImmutableMap.of("table_type", "ICEBERG"));
+ catalog
+ .asTableCatalog()
+ .createTable(paimonTable, createColumns(), null,
ImmutableMap.of("table_type", "PAIMON"));
+ catalog
+ .asTableCatalog()
+ .createTable(hudiTable, createColumns(), null,
ImmutableMap.of("provider", "hudi"));
+ NameIdentifier[] tables =
catalog.asTableCatalog().listTables(Namespace.of(schemaName));
+ Assertions.assertEquals(0, tables.length);
+ }
+
@Test
public void testHiveSchemaProperties() throws TException,
InterruptedException {
// test LOCATION property
diff --git a/docs/apache-hive-catalog.md b/docs/apache-hive-catalog.md
index fc303e9e4..c789059c0 100644
--- a/docs/apache-hive-catalog.md
+++ b/docs/apache-hive-catalog.md
@@ -41,7 +41,14 @@ Besides the [common catalog
properties](./gravitino-server-config.md#gravitino-c
| `kerberos.keytab-uri` | The uri of key tab for the
catalog. Now supported protocols are `https`, `http`, `ftp`, `file`.
| (none) |
required if you use kerberos | 0.4.0 |
| `kerberos.check-interval-sec` | The interval to check validness
of the principal
| 60 | No
| 0.4.0 |
| `kerberos.keytab-fetch-timeout-sec` | The timeout to fetch key tab
| 60 | No
| 0.4.0 |
-| `list-all-tables` | Lists all tables in a database,
including non-Hive tables, such as Iceberg, etc
| false | No
| 0.5.1 |
+| `list-all-tables` | Lists all tables in a database,
including non-Hive tables, such as Iceberg, Hudi, etc.
| false | No
| 0.5.1 |
+
+:::note
+For `list-all-tables=false`, the Hive catalog will filter out:
+- Iceberg tables by table property `table_type=ICEBERG`
+- Paimon tables by table property `table_type=PAIMON`
+- Hudi tables by table property `provider=hudi`
+:::
When you use the Gravitino with Trino. You can pass the Trino Hive connector
configuration using prefix `trino.bypass.`. For example, using
`trino.bypass.hive.config.resources` to pass the `hive.config.resources` to the
Gravitino Hive catalog in Trino runtime.