This is an automated email from the ASF dual-hosted git repository.
fanng pushed a commit to branch branch-0.8
in repository https://gitbox.apache.org/repos/asf/gravitino.git
The following commit(s) were added to refs/heads/branch-0.8 by this push:
new 75d7d38d9c [#5361] improvment(hadoop-catalog): Introduce timeout
mechanism to get Hadoop File System. (#6221)
75d7d38d9c is described below
commit 75d7d38d9cde23a26cc92cc117201e8aa74bad94
Author: github-actions[bot]
<41898282+github-actions[bot]@users.noreply.github.com>
AuthorDate: Tue Jan 14 15:07:06 2025 +0800
[#5361] improvment(hadoop-catalog): Introduce timeout mechanism to get
Hadoop File System. (#6221)
### What changes were proposed in this pull request?
Introduce a timeout mechanism when getting a Hadoop FileSystem instance.
### Why are the changes needed?
Cloud filesystem like S3 and OSS(10 minutes) has a very long connection
and can't be tune by configuration, this will cause deadlock as it will
hold the tree lock for a long time
Fix: #5361
Fix: #6156
### Does this PR introduce _any_ user-facing change?
N/A.
### How was this patch tested?
Existing test.
Co-authored-by: Qi Yu <[email protected]>
---
LICENSE.bin | 1 +
catalogs/catalog-hadoop/build.gradle.kts | 1 +
.../catalog/hadoop/HadoopCatalogOperations.java | 35 +++++++++++++++++++++-
.../hadoop/HadoopCatalogPropertiesMetadata.java | 11 +++++++
.../org/apache/gravitino/lock/LockManager.java | 9 ++++--
docs/hadoop-catalog.md | 9 +++---
6 files changed, 59 insertions(+), 7 deletions(-)
diff --git a/LICENSE.bin b/LICENSE.bin
index effaa4ac4a..d1dddd5279 100644
--- a/LICENSE.bin
+++ b/LICENSE.bin
@@ -374,6 +374,7 @@
Apache Arrow
Rome
Jettison
+ Awaitility
This product bundles various third-party components also under the
Apache Software Foundation License 1.1
diff --git a/catalogs/catalog-hadoop/build.gradle.kts
b/catalogs/catalog-hadoop/build.gradle.kts
index d599a5e72f..3108d993c1 100644
--- a/catalogs/catalog-hadoop/build.gradle.kts
+++ b/catalogs/catalog-hadoop/build.gradle.kts
@@ -54,6 +54,7 @@ dependencies {
exclude("org.fusesource.leveldbjni")
}
implementation(libs.slf4j.api)
+ implementation(libs.awaitility)
compileOnly(libs.guava)
diff --git
a/catalogs/catalog-hadoop/src/main/java/org/apache/gravitino/catalog/hadoop/HadoopCatalogOperations.java
b/catalogs/catalog-hadoop/src/main/java/org/apache/gravitino/catalog/hadoop/HadoopCatalogOperations.java
index 36177bea37..6c032414be 100644
---
a/catalogs/catalog-hadoop/src/main/java/org/apache/gravitino/catalog/hadoop/HadoopCatalogOperations.java
+++
b/catalogs/catalog-hadoop/src/main/java/org/apache/gravitino/catalog/hadoop/HadoopCatalogOperations.java
@@ -31,6 +31,8 @@ import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Optional;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicReference;
import org.apache.commons.lang3.StringUtils;
import org.apache.gravitino.Catalog;
import org.apache.gravitino.Entity;
@@ -71,6 +73,8 @@ import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
+import org.awaitility.Awaitility;
+import org.awaitility.core.ConditionTimeoutException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -755,6 +759,35 @@ public class HadoopCatalogOperations extends
ManagedSchemaOperations
scheme, path, fileSystemProvidersMap.keySet(),
fileSystemProvidersMap.values()));
}
- return provider.getFileSystem(path, config);
+ int timeoutSeconds =
+ (int)
+ propertiesMetadata
+ .catalogPropertiesMetadata()
+ .getOrDefault(
+ config,
HadoopCatalogPropertiesMetadata.FILESYSTEM_CONNECTION_TIMEOUT_SECONDS);
+ try {
+ AtomicReference<FileSystem> fileSystem = new AtomicReference<>();
+ Awaitility.await()
+ .atMost(timeoutSeconds, TimeUnit.SECONDS)
+ .until(
+ () -> {
+ fileSystem.set(provider.getFileSystem(path, config));
+ return true;
+ });
+ return fileSystem.get();
+ } catch (ConditionTimeoutException e) {
+ throw new IOException(
+ String.format(
+ "Failed to get FileSystem for path: %s, scheme: %s, provider:
%s, config: %s within %s "
+ + "seconds, please check the configuration or increase the "
+ + "file system connection timeout time by setting catalog
property: %s",
+ path,
+ scheme,
+ provider,
+ config,
+ timeoutSeconds,
+
HadoopCatalogPropertiesMetadata.FILESYSTEM_CONNECTION_TIMEOUT_SECONDS),
+ e);
+ }
}
}
diff --git
a/catalogs/catalog-hadoop/src/main/java/org/apache/gravitino/catalog/hadoop/HadoopCatalogPropertiesMetadata.java
b/catalogs/catalog-hadoop/src/main/java/org/apache/gravitino/catalog/hadoop/HadoopCatalogPropertiesMetadata.java
index 22cf0d5b2c..3bdc125efc 100644
---
a/catalogs/catalog-hadoop/src/main/java/org/apache/gravitino/catalog/hadoop/HadoopCatalogPropertiesMetadata.java
+++
b/catalogs/catalog-hadoop/src/main/java/org/apache/gravitino/catalog/hadoop/HadoopCatalogPropertiesMetadata.java
@@ -53,6 +53,9 @@ public class HadoopCatalogPropertiesMetadata extends
BaseCatalogPropertiesMetada
*/
public static final String DEFAULT_FS_PROVIDER =
"default-filesystem-provider";
+ static final String FILESYSTEM_CONNECTION_TIMEOUT_SECONDS =
"filesystem-conn-timeout-secs";
+ static final int DEFAULT_GET_FILESYSTEM_TIMEOUT_SECONDS = 6;
+
public static final String BUILTIN_LOCAL_FS_PROVIDER = "builtin-local";
public static final String BUILTIN_HDFS_FS_PROVIDER = "builtin-hdfs";
@@ -82,6 +85,14 @@ public class HadoopCatalogPropertiesMetadata extends
BaseCatalogPropertiesMetada
false /* immutable */,
BUILTIN_LOCAL_FS_PROVIDER, // please see
LocalFileSystemProvider#name()
false /* hidden */))
+ .put(
+ FILESYSTEM_CONNECTION_TIMEOUT_SECONDS,
+ PropertyEntry.integerOptionalPropertyEntry(
+ FILESYSTEM_CONNECTION_TIMEOUT_SECONDS,
+ "Timeout to wait for to create the Hadoop file system client
instance.",
+ false /* immutable */,
+ DEFAULT_GET_FILESYSTEM_TIMEOUT_SECONDS,
+ false /* hidden */))
// The following two are about authentication.
.putAll(KERBEROS_PROPERTY_ENTRIES)
.putAll(AuthenticationConfig.AUTHENTICATION_PROPERTY_ENTRIES)
diff --git a/core/src/main/java/org/apache/gravitino/lock/LockManager.java
b/core/src/main/java/org/apache/gravitino/lock/LockManager.java
index 222dee8daa..d52c858cc4 100644
--- a/core/src/main/java/org/apache/gravitino/lock/LockManager.java
+++ b/core/src/main/java/org/apache/gravitino/lock/LockManager.java
@@ -26,6 +26,7 @@ import static
org.apache.gravitino.Configs.TREE_LOCK_MIN_NODE_IN_MEMORY;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.Lists;
import com.google.common.util.concurrent.ThreadFactoryBuilder;
+import java.text.SimpleDateFormat;
import java.util.List;
import java.util.concurrent.ScheduledThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
@@ -136,10 +137,14 @@ public class LockManager {
// If the thread is holding the lock for more than 30 seconds,
we will log it.
if (System.currentTimeMillis() - ts > 30000) {
LOG.warn(
- "Dead lock detected for thread with identifier {} on node
{}, threads that holding the node: {} ",
+ "Thread with identifier {} holds the lock node {} for more
than 30s since {}, please "
+ + "check if some dead lock or thread hang like
io-connection hangs",
threadIdentifier,
node,
- node.getHoldingThreadTimestamp());
+ // SimpleDateFormat is not thread-safe, so we should
create a new instance for
+ // each time
+ new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
+ .format(node.getHoldingThreadTimestamp()));
}
});
}
diff --git a/docs/hadoop-catalog.md b/docs/hadoop-catalog.md
index 99e1dd7854..cbdae84689 100644
--- a/docs/hadoop-catalog.md
+++ b/docs/hadoop-catalog.md
@@ -23,10 +23,11 @@ Hadoop 3. If there's any compatibility issue, please create
an [issue](https://g
Besides the [common catalog
properties](./gravitino-server-config.md#apache-gravitino-catalog-properties-configuration),
the Hadoop catalog has the following properties:
-| Property Name | Description
| Default Value | Required | Since Version |
-|------------------------|----------------------------------------------------|---------------|----------|------------------|
-| `location` | The storage location managed by Hadoop catalog.
| (none) | No | 0.5.0 |
-| `credential-providers` | The credential provider types, separated by comma.
| (none) | No | 0.8.0-incubating |
+| Property Name | Description
| Default Value |
Required | Since Version |
+|--------------------------------|-----------------------------------------------------------------------------------------------------|---------------|----------|------------------|
+| `location` | The storage location managed by Hadoop
catalog. | (none) |
No | 0.5.0 |
+| `filesystem-conn-timeout-secs` | The timeout of getting the file system
using Hadoop FileSystem client instance. Time unit: seconds. | 6 |
No | 0.8.0-incubating |
+| `credential-providers` | The credential provider types, separated by
comma. | (none) | No
| 0.8.0-incubating |
Please refer to [Credential vending](./security/credential-vending.md) for
more details about credential vending.