This is an automated email from the ASF dual-hosted git repository.

jshao pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/gravitino.git


The following commit(s) were added to refs/heads/main by this push:
     new 6d05ec4b8 [#4089] fix(hive catalog): the problem of slow acquisition 
of hive table list (#4469)
6d05ec4b8 is described below

commit 6d05ec4b8bf278c2dd7a36af35589f07b220fcab
Author: mygrsun <[email protected]>
AuthorDate: Mon Nov 4 21:08:31 2024 +0800

    [#4089] fix(hive catalog): the problem of slow acquisition of hive table 
list (#4469)
    
    ### What changes were proposed in this pull request?
    
    the problem of slow acquisition of hive table list.
    Using listTableNamesByFilter to replace the getTableObjectsByName method.
    
    
    ### Why are the changes needed?
    
    I found that list-table will take 300s when a schema has 5000 tables.
    
    Fix: #4089
    
    ### Does this PR introduce _any_ user-facing change?
    
    no
    
    ### How was this patch tested?
    
    Manual testing
    
    ---------
    
    Co-authored-by: ericqin <[email protected]>
    Co-authored-by: mchades <[email protected]>
---
 .../catalog/hive/HiveCatalogOperations.java        | 77 +++++++++++++---------
 .../gravitino/catalog/hive/TestHiveTable.java      | 35 +++++++++-
 .../hive/integration/test/CatalogHiveIT.java       | 20 ++++++
 docs/apache-hive-catalog.md                        |  9 ++-
 4 files changed, 106 insertions(+), 35 deletions(-)

diff --git 
a/catalogs/catalog-hive/src/main/java/org/apache/gravitino/catalog/hive/HiveCatalogOperations.java
 
b/catalogs/catalog-hive/src/main/java/org/apache/gravitino/catalog/hive/HiveCatalogOperations.java
index 6de028f41..bb7d06f6b 100644
--- 
a/catalogs/catalog-hive/src/main/java/org/apache/gravitino/catalog/hive/HiveCatalogOperations.java
+++ 
b/catalogs/catalog-hive/src/main/java/org/apache/gravitino/catalog/hive/HiveCatalogOperations.java
@@ -21,9 +21,7 @@ package org.apache.gravitino.catalog.hive;
 import static 
org.apache.gravitino.catalog.hive.HiveCatalogPropertiesMeta.LIST_ALL_TABLES;
 import static 
org.apache.gravitino.catalog.hive.HiveCatalogPropertiesMeta.METASTORE_URIS;
 import static 
org.apache.gravitino.catalog.hive.HiveCatalogPropertiesMeta.PRINCIPAL;
-import static 
org.apache.gravitino.catalog.hive.HiveTable.ICEBERG_TABLE_TYPE_VALUE;
 import static org.apache.gravitino.catalog.hive.HiveTable.SUPPORT_TABLE_TYPES;
-import static org.apache.gravitino.catalog.hive.HiveTable.TABLE_TYPE_PROP;
 import static 
org.apache.gravitino.catalog.hive.HiveTablePropertiesMetadata.COMMENT;
 import static 
org.apache.gravitino.catalog.hive.HiveTablePropertiesMetadata.TABLE_TYPE;
 import static org.apache.gravitino.connector.BaseCatalog.CATALOG_BYPASS_PREFIX;
@@ -93,6 +91,7 @@ import 
org.apache.hadoop.hive.metastore.api.InvalidOperationException;
 import org.apache.hadoop.hive.metastore.api.NoSuchObjectException;
 import org.apache.hadoop.hive.metastore.api.StorageDescriptor;
 import org.apache.hadoop.hive.metastore.api.UnknownDBException;
+import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants;
 import org.apache.hadoop.security.SecurityUtil;
 import org.apache.hadoop.security.UserGroupInformation;
 import org.apache.thrift.TException;
@@ -116,7 +115,10 @@ public class HiveCatalogOperations implements 
CatalogOperations, SupportsSchemas
   private ScheduledThreadPoolExecutor checkTgtExecutor;
   private String kerberosRealm;
   private ProxyPlugin proxyPlugin;
-  boolean listAllTables = true;
+  private boolean listAllTables = true;
+  // The maximum number of tables that can be returned by the 
listTableNamesByFilter function.
+  // The default value is -1, which means that all tables are returned.
+  private static final short MAX_TABLES = -1;
 
   // Map that maintains the mapping of keys in Gravitino to that in Hive, for 
example, users
   // will only need to set the configuration 'METASTORE_URL' in Gravitino and 
Gravitino will change
@@ -539,23 +541,32 @@ public class HiveCatalogOperations implements 
CatalogOperations, SupportsSchemas
       // then based on
       // those names we can obtain metadata for each individual table and get 
the type we needed.
       List<String> allTables = clientPool.run(c -> 
c.getAllTables(schemaIdent.name()));
-      return clientPool.run(
-          c ->
-              c.getTableObjectsByName(schemaIdent.name(), allTables).stream()
-                  .filter(
-                      tb -> {
-                        boolean isSupportTable = 
SUPPORT_TABLE_TYPES.contains(tb.getTableType());
-                        if (!isSupportTable) {
-                          return false;
-                        }
-                        if (!listAllTables) {
-                          Map<String, String> parameters = tb.getParameters();
-                          return isHiveTable(parameters);
-                        }
-                        return true;
-                      })
-                  .map(tb -> NameIdentifier.of(namespace, tb.getTableName()))
-                  .toArray(NameIdentifier[]::new));
+      if (!listAllTables) {
+        // The reason for using the listTableNamesByFilter function is that the
+        // getTableObjectsByName function has poor performance. Currently, 
we focus on the
+        // Iceberg, Paimon and Hudi table. In the future, if necessary, we 
will need to filter out
+        // other tables. In addition, the current return also includes tables 
of type VIRTUAL-VIEW.
+        String icebergAndPaimonFilter = getIcebergAndPaimonFilter();
+        List<String> icebergAndPaimonTables =
+            clientPool.run(
+                c ->
+                    c.listTableNamesByFilter(
+                        schemaIdent.name(), icebergAndPaimonFilter, 
MAX_TABLES));
+        allTables.removeAll(icebergAndPaimonTables);
+
+        // filter out the Hudi tables
+        String hudiFilter =
+            String.format(
+                "%sprovider like \"hudi\"", 
hive_metastoreConstants.HIVE_FILTER_FIELD_PARAMS);
+        List<String> hudiTables =
+            clientPool.run(
+                c -> c.listTableNamesByFilter(schemaIdent.name(), hudiFilter, 
MAX_TABLES));
+        removeHudiTables(allTables, hudiTables);
+      }
+      return allTables.stream()
+          .map(tbName -> NameIdentifier.of(namespace, tbName))
+          .toArray(NameIdentifier[]::new);
+
     } catch (UnknownDBException e) {
       throw new NoSuchSchemaException(
           "Schema (database) does not exist %s in Hive Metastore", namespace);
@@ -569,20 +580,24 @@ public class HiveCatalogOperations implements 
CatalogOperations, SupportsSchemas
     }
   }
 
-  boolean isHiveTable(Map<String, String> tableParameters) {
-    if (isIcebergTable(tableParameters)) return false;
-    return true;
+  private static String getIcebergAndPaimonFilter() {
+    String icebergFilter =
+        String.format(
+            "%stable_type like \"ICEBERG\"", 
hive_metastoreConstants.HIVE_FILTER_FIELD_PARAMS);
+    String paimonFilter =
+        String.format(
+            "%stable_type like \"PAIMON\"", 
hive_metastoreConstants.HIVE_FILTER_FIELD_PARAMS);
+    return String.format("%s or %s", icebergFilter, paimonFilter);
   }
 
-  boolean isIcebergTable(Map<String, String> tableParameters) {
-    if (tableParameters != null) {
-      boolean isIcebergTable =
-          
ICEBERG_TABLE_TYPE_VALUE.equalsIgnoreCase(tableParameters.get(TABLE_TYPE_PROP));
-      if (isIcebergTable) {
-        return true;
-      }
+  private void removeHudiTables(List<String> allTables, List<String> 
hudiTables) {
+    for (String hudiTable : hudiTables) {
+      allTables.removeIf(
+          t ->
+              t.equals(hudiTable)
+                  || t.startsWith(hudiTable + "_ro")
+                  || t.startsWith(hudiTable + "_rt"));
     }
-    return false;
   }
 
   /**
diff --git 
a/catalogs/catalog-hive/src/test/java/org/apache/gravitino/catalog/hive/TestHiveTable.java
 
b/catalogs/catalog-hive/src/test/java/org/apache/gravitino/catalog/hive/TestHiveTable.java
index 9b9e0c8a8..2823bf276 100644
--- 
a/catalogs/catalog-hive/src/test/java/org/apache/gravitino/catalog/hive/TestHiveTable.java
+++ 
b/catalogs/catalog-hive/src/test/java/org/apache/gravitino/catalog/hive/TestHiveTable.java
@@ -25,6 +25,7 @@ import static 
org.apache.gravitino.rel.expressions.transforms.Transforms.day;
 import static 
org.apache.gravitino.rel.expressions.transforms.Transforms.identity;
 import static org.apache.hadoop.hive.metastore.TableType.EXTERNAL_TABLE;
 
+import com.google.common.collect.ImmutableMap;
 import com.google.common.collect.Maps;
 import java.time.Instant;
 import java.util.Arrays;
@@ -70,14 +71,14 @@ public class TestHiveTable extends MiniHiveMetastoreService 
{
       NameIdentifier.of(META_LAKE_NAME, HIVE_CATALOG_NAME, HIVE_SCHEMA_NAME);
 
   @BeforeAll
-  private static void setup() {
+  public static void setup() {
     hiveCatalog = initHiveCatalog();
     hiveCatalogOperations = (HiveCatalogOperations) hiveCatalog.ops();
     hiveSchema = initHiveSchema();
   }
 
   @AfterEach
-  private void resetSchema() {
+  public void resetSchema() {
     hiveCatalogOperations.dropSchema(schemaIdent, true);
     hiveSchema = initHiveSchema();
   }
@@ -343,7 +344,35 @@ public class TestHiveTable extends 
MiniHiveMetastoreService {
   }
 
   @Test
-  public void testListTableException() {
+  public void testListTable() {
+    // mock iceberg table and hudi table
+    NameIdentifier icebergTableIdent =
+        NameIdentifier.of(META_LAKE_NAME, hiveCatalog.name(), 
hiveSchema.name(), "iceberg_table");
+    NameIdentifier hudiTableIdent =
+        NameIdentifier.of(META_LAKE_NAME, hiveCatalog.name(), 
hiveSchema.name(), "hudi_table");
+
+    hiveCatalogOperations.createTable(
+        icebergTableIdent,
+        new Column[] {
+          
HiveColumn.builder().withName("col_1").withType(Types.ByteType.get()).build()
+        },
+        HIVE_COMMENT,
+        ImmutableMap.of("table_type", "ICEBERG"));
+    hiveCatalogOperations.createTable(
+        hudiTableIdent,
+        new Column[] {
+          
HiveColumn.builder().withName("col_1").withType(Types.ByteType.get()).build()
+        },
+        HIVE_COMMENT,
+        ImmutableMap.of("provider", "hudi"));
+
+    // test list table
+    NameIdentifier[] tableIdents =
+        hiveCatalogOperations.listTables(
+            Namespace.of("metalake", hiveCatalog.name(), hiveSchema.name()));
+    Assertions.assertEquals(0, tableIdents.length);
+
+    // test exception
     Namespace tableNs = Namespace.of("metalake", hiveCatalog.name(), 
"not_exist_db");
     TableCatalog tableCatalog = hiveCatalogOperations;
     Throwable exception =
diff --git 
a/catalogs/catalog-hive/src/test/java/org/apache/gravitino/catalog/hive/integration/test/CatalogHiveIT.java
 
b/catalogs/catalog-hive/src/test/java/org/apache/gravitino/catalog/hive/integration/test/CatalogHiveIT.java
index 903f3fe8a..16ca3b57f 100644
--- 
a/catalogs/catalog-hive/src/test/java/org/apache/gravitino/catalog/hive/integration/test/CatalogHiveIT.java
+++ 
b/catalogs/catalog-hive/src/test/java/org/apache/gravitino/catalog/hive/integration/test/CatalogHiveIT.java
@@ -52,6 +52,7 @@ import org.apache.gravitino.Catalog;
 import org.apache.gravitino.CatalogChange;
 import org.apache.gravitino.MetalakeChange;
 import org.apache.gravitino.NameIdentifier;
+import org.apache.gravitino.Namespace;
 import org.apache.gravitino.Schema;
 import org.apache.gravitino.SchemaChange;
 import org.apache.gravitino.SupportsSchemas;
@@ -613,6 +614,25 @@ public class CatalogHiveIT extends BaseIT {
     Assertions.assertTrue(exception.getMessage().contains("cannot be set"));
   }
 
+  @Test
+  public void testListTables() {
+    // mock iceberg, paimon, and hudi tables
+    NameIdentifier icebergTable = NameIdentifier.of(schemaName, 
"iceberg_table");
+    NameIdentifier paimonTable = NameIdentifier.of(schemaName, "paimon_table");
+    NameIdentifier hudiTable = NameIdentifier.of(schemaName, "hudi_table");
+    catalog
+        .asTableCatalog()
+        .createTable(icebergTable, createColumns(), null, 
ImmutableMap.of("table_type", "ICEBERG"));
+    catalog
+        .asTableCatalog()
+        .createTable(paimonTable, createColumns(), null, 
ImmutableMap.of("table_type", "PAIMON"));
+    catalog
+        .asTableCatalog()
+        .createTable(hudiTable, createColumns(), null, 
ImmutableMap.of("provider", "hudi"));
+    NameIdentifier[] tables = 
catalog.asTableCatalog().listTables(Namespace.of(schemaName));
+    Assertions.assertEquals(0, tables.length);
+  }
+
   @Test
   public void testHiveSchemaProperties() throws TException, 
InterruptedException {
     // test LOCATION property
diff --git a/docs/apache-hive-catalog.md b/docs/apache-hive-catalog.md
index fc303e9e4..c789059c0 100644
--- a/docs/apache-hive-catalog.md
+++ b/docs/apache-hive-catalog.md
@@ -41,7 +41,14 @@ Besides the [common catalog 
properties](./gravitino-server-config.md#gravitino-c
 | `kerberos.keytab-uri`                    | The uri of key tab for the 
catalog. Now supported protocols are `https`, `http`, `ftp`, `file`.            
                                                                                
                                                         | (none)        | 
required if you use kerberos | 0.4.0         |
 | `kerberos.check-interval-sec`            | The interval to check validness 
of the principal                                                                
                                                                                
                                                    | 60            | No        
                   | 0.4.0         |
 | `kerberos.keytab-fetch-timeout-sec`      | The timeout to fetch key tab      
                                                                                
                                                                                
                                                  | 60            | No          
                 | 0.4.0         |
-| `list-all-tables`                        | Lists all tables in a database, 
including non-Hive tables, such as Iceberg, etc                                 
                                                                                
                                                    | false         | No        
                   | 0.5.1         |
+| `list-all-tables`                        | Lists all tables in a database, 
including non-Hive tables, such as Iceberg, Hudi, etc.                          
                                                                                
                                                    | false         | No        
                   | 0.5.1         |
+
+:::note
+For `list-all-tables=false`, the Hive catalog will filter out:
+- Iceberg tables by table property `table_type=ICEBERG`
+- Paimon tables by table property `table_type=PAIMON`
+- Hudi tables by table property `provider=hudi`
+:::
 
 When you use the Gravitino with Trino. You can pass the Trino Hive connector 
configuration using prefix `trino.bypass.`. For example, using 
`trino.bypass.hive.config.resources` to pass the `hive.config.resources` to the 
Gravitino Hive catalog in Trino runtime.
 

Reply via email to