This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch branch-2.1
in repository https://gitbox.apache.org/repos/asf/doris.git

commit c758a25dd818dcad3ec2a3119431f7b4e7c562d9
Author: Mingyu Chen <morning...@163.com>
AuthorDate: Thu Mar 28 23:36:16 2024 +0800

    [opt](fqdn) Add DNS Cache for FE and BE (#32869)
    
    Previously, when FQDN was enabled, Doris called the DNS resolver to get the IP from the hostname
    each time 1) an FE got a BE's grpc client, or 2) a BE got another BE's brpc client.
    So under high concurrency, the DNS resolver could be overloaded and fail to resolve hostnames.
    
    This PR mainly changes:
    
    1. Add DNSCache for both FE and BE.
        The DNSCache will run on every FE and BE node. It has a cache, key is 
hostname and value is IP.
        Caller can get IP by hostname from this cache, and if hostname does not 
exist, it will try to resolve it
        and update the cache.
        In addition, DNSCache has a daemon thread that refreshes the cache every 1 min, in case the IP
        changes at any time.
    
    There are other implementations of this DNS cache:
    
    1. https://github.com/kaka11chen/doris/commit/36fed139974ed52dfa61b656f3e4d64f56a4185a
        This is for the BE side, but it does not handle the IP change case.
    
    2. https://github.com/apache/doris/pull/28479
        This is for the FE side, but it only works with the Master FE. Other FE nodes will not be aware
        of the IP change.
        Also, there are a bunch of BackendServiceProxy instances, and that PR (#28479) only handles the
        cache in one of them.
---
 be/src/common/config.cpp                           |  2 +
 be/src/common/config.h                             |  4 +
 be/src/runtime/client_cache.cpp                    |  2 +-
 be/src/runtime/exec_env.h                          |  4 +
 be/src/runtime/exec_env_init.cpp                   |  3 +
 be/src/util/brpc_client_cache.h                    |  4 +-
 be/src/util/dns_cache.cpp                          | 84 +++++++++++++++++++
 be/src/util/dns_cache.h                            | 57 +++++++++++++
 .../main/java/org/apache/doris/catalog/Env.java    | 10 +++
 .../java/org/apache/doris/common/DNSCache.java     | 95 ++++++++++++++++++++++
 .../org/apache/doris/common/util/NetUtils.java     | 16 +++-
 .../org/apache/doris/rpc/BackendServiceProxy.java  |  4 +-
 12 files changed, 278 insertions(+), 7 deletions(-)

diff --git a/be/src/common/config.cpp b/be/src/common/config.cpp
index d577f5994d4..1d26085db39 100644
--- a/be/src/common/config.cpp
+++ b/be/src/common/config.cpp
@@ -1188,6 +1188,8 @@ 
DEFINE_mInt32(table_sink_partition_write_max_partition_nums_per_writer, "128");
 /** Hive sink configurations **/
 DEFINE_mInt64(hive_sink_max_file_size, "1073741824"); // 1GB
 
+DEFINE_mInt32(thrift_client_open_num_tries, "1");
+
 // clang-format off
 #ifdef BE_TEST
 // test s3
diff --git a/be/src/common/config.h b/be/src/common/config.h
index 951fd62f87f..5650db764fa 100644
--- a/be/src/common/config.h
+++ b/be/src/common/config.h
@@ -1263,6 +1263,10 @@ 
DECLARE_mInt32(table_sink_partition_write_max_partition_nums_per_writer);
 /** Hive sink configurations **/
 DECLARE_mInt64(hive_sink_max_file_size); // 1GB
 
+// Number of attempts made to open a thrift client. The default 1 means open is tried only once.
+// When more than one attempt is configured, 100 milliseconds are waited between attempts.
+DECLARE_mInt32(thrift_client_open_num_tries);
+
 #ifdef BE_TEST
 // test s3
 DECLARE_String(test_s3_resource);
diff --git a/be/src/runtime/client_cache.cpp b/be/src/runtime/client_cache.cpp
index 3da31caf5c8..ea7b43b6102 100644
--- a/be/src/runtime/client_cache.cpp
+++ b/be/src/runtime/client_cache.cpp
@@ -114,7 +114,7 @@ Status ClientCacheHelper::_create_client(const 
TNetworkAddress& hostport,
 
     client_impl->set_conn_timeout(config::thrift_connect_timeout_seconds * 
1000);
 
-    Status status = client_impl->open();
+    Status status = 
client_impl->open_with_retry(config::thrift_client_open_num_tries, 100);
 
     if (!status.ok()) {
         *client_key = nullptr;
diff --git a/be/src/runtime/exec_env.h b/be/src/runtime/exec_env.h
index ea665f3f5a8..f8b2ecfa6ae 100644
--- a/be/src/runtime/exec_env.h
+++ b/be/src/runtime/exec_env.h
@@ -105,6 +105,7 @@ class RowCache;
 class DummyLRUCache;
 class CacheManager;
 class WalManager;
+class DNSCache;
 
 inline bool k_doris_exit = false;
 
@@ -214,6 +215,8 @@ public:
     FileMetaCache* file_meta_cache() { return _file_meta_cache; }
     MemTableMemoryLimiter* memtable_memory_limiter() { return 
_memtable_memory_limiter.get(); }
     WalManager* wal_mgr() { return _wal_manager.get(); }
+    // Returns the DNSCache pointer held by this ExecEnv. The member defaults to nullptr
+    // (see _dns_cache below), so callers get nullptr if no cache has been created yet.
+    DNSCache* dns_cache() { return _dns_cache; }
+
 #ifdef BE_TEST
     void set_ready() { this->_s_ready = true; }
     void set_not_ready() { this->_s_ready = false; }
@@ -363,6 +366,7 @@ private:
     std::unique_ptr<LoadStreamMapPool> _load_stream_map_pool;
     std::unique_ptr<vectorized::DeltaWriterV2Pool> _delta_writer_v2_pool;
     std::shared_ptr<WalManager> _wal_manager;
+    DNSCache* _dns_cache = nullptr;
 
     std::mutex _frontends_lock;
     // ip:brpc_port -> frontend_indo
diff --git a/be/src/runtime/exec_env_init.cpp b/be/src/runtime/exec_env_init.cpp
index 1da172716da..89040a342f1 100644
--- a/be/src/runtime/exec_env_init.cpp
+++ b/be/src/runtime/exec_env_init.cpp
@@ -87,6 +87,7 @@
 #include "util/brpc_client_cache.h"
 #include "util/cpu_info.h"
 #include "util/disk_info.h"
+#include "util/dns_cache.h"
 #include "util/doris_metrics.h"
 #include "util/mem_info.h"
 #include "util/metrics.h"
@@ -232,6 +233,7 @@ Status ExecEnv::_init(const std::vector<StorePath>& 
store_paths,
     _load_stream_map_pool = std::make_unique<LoadStreamMapPool>();
     _delta_writer_v2_pool = std::make_unique<vectorized::DeltaWriterV2Pool>();
     _wal_manager = WalManager::create_shared(this, 
config::group_commit_wal_path);
+    _dns_cache = new DNSCache();
     _spill_stream_mgr = new vectorized::SpillStreamManager(spill_store_paths);
 
     _backend_client_cache->init_metrics("backend");
@@ -554,6 +556,7 @@ void ExecEnv::destroy() {
     _delta_writer_v2_pool.reset();
     _load_stream_map_pool.reset();
     SAFE_STOP(_storage_engine);
+    SAFE_DELETE(_dns_cache);
     SAFE_STOP(_spill_stream_mgr);
     SAFE_SHUTDOWN(_buffered_reader_prefetch_thread_pool);
     SAFE_SHUTDOWN(_s3_file_upload_thread_pool);
diff --git a/be/src/util/brpc_client_cache.h b/be/src/util/brpc_client_cache.h
index ff537850854..7b313d6ae02 100644
--- a/be/src/util/brpc_client_cache.h
+++ b/be/src/util/brpc_client_cache.h
@@ -40,6 +40,8 @@
 
 #include "common/compiler_util.h" // IWYU pragma: keep
 #include "common/config.h"
+#include "runtime/exec_env.h"
+#include "util/dns_cache.h"
 #include "util/network_util.h"
 
 namespace doris {
@@ -79,7 +81,7 @@ public:
         std::string realhost;
         realhost = host;
         if (!is_valid_ip(host)) {
-            Status status = hostname_to_ip(host, realhost);
+            Status status = ExecEnv::GetInstance()->dns_cache()->get(host, 
&realhost);
             if (!status.ok()) {
                 LOG(WARNING) << "failed to get ip from host:" << 
status.to_string();
                 return nullptr;
diff --git a/be/src/util/dns_cache.cpp b/be/src/util/dns_cache.cpp
new file mode 100644
index 00000000000..f2bd4ce91e6
--- /dev/null
+++ b/be/src/util/dns_cache.cpp
@@ -0,0 +1,84 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "util/dns_cache.h"
+
+#include "service/backend_options.h"
+#include "util/network_util.h"
+
+namespace doris {
+
+// Starts the background thread that periodically re-resolves every cached hostname.
+DNSCache::DNSCache() {
+    // Keep the thread joinable (do NOT detach it): the thread dereferences `this`,
+    // so the destructor must be able to wait for it before the members are destroyed.
+    // A detached thread is never joinable, which would turn the join in the destructor
+    // into a no-op and let the thread touch a deleted object.
+    refresh_thread = std::thread(&DNSCache::_refresh_cache, this);
+}
+
+// Requests the refresh thread to stop and waits until it has exited.
+DNSCache::~DNSCache() {
+    stop_refresh = true;
+    if (refresh_thread.joinable()) {
+        refresh_thread.join();
+    }
+}
+
+// Looks up the IP of `hostname`.
+// On a cache hit the cached IP is written to `*ip`. On a miss the hostname is resolved,
+// stored in the cache and then written to `*ip`.
+// Returns a non-OK status if the hostname can not be resolved; `*ip` is untouched then.
+Status DNSCache::get(const std::string& hostname, std::string* ip) {
+    {
+        std::shared_lock<std::shared_mutex> lock(mutex);
+        auto it = cache.find(hostname);
+        if (it != cache.end()) {
+            *ip = it->second;
+            return Status::OK();
+        }
+    }
+    // Update if not found
+    RETURN_IF_ERROR(_update(hostname));
+    std::shared_lock<std::shared_mutex> lock(mutex);
+    // Use find() rather than operator[]: operator[] is a non-const operation (it may insert)
+    // and must not be called while only a shared (reader) lock is held.
+    auto it = cache.find(hostname);
+    if (it == cache.end()) {
+        // Not expected: _update() just stored the entry and nothing erases entries.
+        return Status::InternalError("hostname " + hostname + " not found in dns cache");
+    }
+    *ip = it->second;
+    return Status::OK();
+}
+
+// Resolves `hostname` and stores the result in the cache if the entry is new or the IP changed.
+// The resolution itself runs without holding the lock; only the map update is exclusive.
+// Returns the resolver's error status if resolution fails, leaving the cache unchanged.
+Status DNSCache::_update(const std::string& hostname) {
+    std::string real_ip;
+    RETURN_IF_ERROR(hostname_to_ip(hostname, real_ip, BackendOptions::is_bind_ipv6()));
+    std::unique_lock<std::shared_mutex> lock(mutex);
+    auto it = cache.find(hostname);
+    if (it == cache.end() || it->second != real_ip) {
+        cache[hostname] = real_ip;
+        LOG(INFO) << "update hostname " << hostname << "'s ip to " << real_ip;
+    }
+    return Status::OK();
+}
+
+// Body of the refresh thread: every minute, re-resolve all hostnames currently in the cache.
+void DNSCache::_refresh_cache() {
+    // refresh every 1 min, but sleep in 1 second slices so that a stop request
+    // is noticed quickly instead of after a full minute.
+    constexpr int refresh_interval_seconds = 60;
+    while (!stop_refresh) {
+        for (int i = 0; i < refresh_interval_seconds && !stop_refresh; ++i) {
+            std::this_thread::sleep_for(std::chrono::seconds(1));
+        }
+        if (stop_refresh) {
+            break;
+        }
+        // Snapshot the keys under the shared lock, then resolve without holding any lock
+        // so that slow DNS lookups do not block readers.
+        std::unordered_set<std::string> keys;
+        {
+            std::shared_lock<std::shared_mutex> lock(mutex);
+            for (const auto& entry : cache) {
+                keys.insert(entry.first);
+            }
+        }
+        for (const auto& key : keys) {
+            Status st = _update(key);
+            if (!st.ok()) {
+                // Keep the previously cached ip; it will be retried in the next round.
+                LOG(WARNING) << "failed to refresh ip of hostname " << key << ": "
+                             << st.to_string();
+            }
+        }
+    }
+}
+
+} // end of namespace doris
diff --git a/be/src/util/dns_cache.h b/be/src/util/dns_cache.h
new file mode 100644
index 00000000000..5dc413c53e2
--- /dev/null
+++ b/be/src/util/dns_cache.h
@@ -0,0 +1,57 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <atomic>
+#include <chrono>
+#include <iostream>
+#include <shared_mutex>
+#include <string>
+#include <thread>
+#include <unordered_map>
+
+#include "common/status.h"
+
+namespace doris {
+
+// A hostname -> ip cache with a background thread that periodically re-resolves
+// every cached hostname, so that ip changes are eventually picked up.
+// The map is guarded by a shared_mutex: lookups take it shared, updates take it exclusive.
+// Same as
+// fe/fe-core/src/main/java/org/apache/doris/common/DNSCache.java
+class DNSCache {
+public:
+    // Starts the refresh thread.
+    DNSCache();
+    // Sets the stop flag and joins the refresh thread if it is joinable.
+    ~DNSCache();
+
+    // get ip by hostname
+    // If the hostname is not cached yet, it is resolved and stored first.
+    // Returns a non-OK status if the hostname can not be resolved.
+    Status get(const std::string& hostname, std::string* ip);
+
+private:
+    // update the ip of hostname in cache
+    Status _update(const std::string& hostname);
+
+    // a function for refresh daemon thread
+    // update cache at a fixed interval
+    void _refresh_cache();
+
+private:
+    // hostname -> ip
+    std::unordered_map<std::string, std::string> cache;
+    // protects `cache`
+    mutable std::shared_mutex mutex;
+    std::thread refresh_thread;
+    // Written by the thread that destroys the cache and read by the refresh thread,
+    // so it has to be atomic; a plain bool would be a data race.
+    std::atomic<bool> stop_refresh {false};
+};
+
+} // end of namespace doris
diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/Env.java 
b/fe/fe-core/src/main/java/org/apache/doris/catalog/Env.java
index 0f0eab8b672..dd6ec52bac0 100755
--- a/fe/fe-core/src/main/java/org/apache/doris/catalog/Env.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/Env.java
@@ -98,6 +98,7 @@ import org.apache.doris.common.ClientPool;
 import org.apache.doris.common.Config;
 import org.apache.doris.common.ConfigBase;
 import org.apache.doris.common.ConfigException;
+import org.apache.doris.common.DNSCache;
 import org.apache.doris.common.DdlException;
 import org.apache.doris.common.ErrorCode;
 import org.apache.doris.common.ErrorReport;
@@ -526,6 +527,8 @@ public class Env {
 
     private InsertOverwriteManager insertOverwriteManager;
 
+    private DNSCache dnsCache;
+
     public List<TFrontendInfo> getFrontendInfos() {
         List<TFrontendInfo> res = new ArrayList<>();
 
@@ -760,6 +763,7 @@ public class Env {
                 "TopicPublisher", Config.publish_topic_info_interval_ms, 
systemInfo);
         this.mtmvService = new MTMVService();
         this.insertOverwriteManager = new InsertOverwriteManager();
+        this.dnsCache = new DNSCache();
     }
 
     public static void destroyCheckpoint() {
@@ -915,6 +919,10 @@ public class Env {
         return getCurrentEnv().getHiveTransactionMgr();
     }
 
+    /**
+     * Returns the DNSCache instance held by this Env.
+     */
+    public DNSCache getDnsCache() {
+        return this.dnsCache;
+    }
+
     // Use tryLock to avoid potential dead lock
     private boolean tryLock(boolean mustLock) {
         while (true) {
@@ -1685,6 +1693,8 @@ public class Env {
         if (Config.enable_hms_events_incremental_sync) {
             metastoreEventsProcessor.start();
         }
+
+        dnsCache.start();
     }
 
     private void transferToNonMaster(FrontendNodeType newType) {
diff --git a/fe/fe-core/src/main/java/org/apache/doris/common/DNSCache.java 
b/fe/fe-core/src/main/java/org/apache/doris/common/DNSCache.java
new file mode 100644
index 00000000000..1fe96eba20f
--- /dev/null
+++ b/fe/fe-core/src/main/java/org/apache/doris/common/DNSCache.java
@@ -0,0 +1,95 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package org.apache.doris.common;
+
+import org.apache.doris.common.util.NetUtils;
+
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+
+import java.net.UnknownHostException;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.ScheduledExecutorService;
+
+/**
+ * DNSCache is a class that caches DNS lookups and periodically refreshes them.
+ * It uses a ConcurrentHashMap to store the hostname to IP address mappings and a ScheduledExecutorService
+ * to periodically refresh these mappings.
+ * Only successful lookups are stored: a failed lookup never creates an entry and never replaces
+ * an IP address that is already cached.
+ */
+public class DNSCache {
+    private static final Logger LOG = LogManager.getLogger(DNSCache.class);
+
+    private final ConcurrentHashMap<String, String> cache = new ConcurrentHashMap<>();
+    private final ScheduledExecutorService executor = ThreadPoolManager.newDaemonScheduledThreadPool(1,
+            "dns_cache_pool", true);
+
+    /**
+     * Check if the enable_fqdn_mode configuration is set.
+     * If it is, it schedules a task to refresh the DNS cache every 60 seconds,
+     * starting after an initial delay of 120 seconds.
+     */
+    public void start() {
+        if (Config.enable_fqdn_mode) {
+            executor.scheduleAtFixedRate(this::refresh, 120, 60, java.util.concurrent.TimeUnit.SECONDS);
+        }
+    }
+
+    /**
+     * The get method retrieves the IP address for a given hostname from the cache.
+     * If the hostname is not in the cache, it resolves the hostname to an IP address and stores it in the cache.
+     * A failed resolution is not cached, so the next call resolves the hostname again instead of
+     * returning a stale failure (the refresh task only runs when enable_fqdn_mode is set).
+     *
+     * @param hostname The hostname for which to get the IP address.
+     * @return The IP address for the given hostname, or an empty string if the hostname cannot be resolved.
+     */
+    public String get(String hostname) {
+        // computeIfAbsent records no mapping when the mapping function returns null.
+        String ip = cache.computeIfAbsent(hostname, host -> {
+            String resolved = resolveHostname(host);
+            return resolved.isEmpty() ? null : resolved;
+        });
+        return ip == null ? "" : ip;
+    }
+
+    /**
+     * The resolveHostname method resolves a hostname to an IP address.
+     * If the hostname cannot be resolved, it returns an empty string.
+     *
+     * @param hostname The hostname to resolve.
+     * @return The IP address for the given hostname, or an empty string if the hostname cannot be resolved.
+     */
+    private String resolveHostname(String hostname) {
+        try {
+            return NetUtils.getIpByHost(hostname, 0);
+        } catch (UnknownHostException e) {
+            return "";
+        }
+    }
+
+    /**
+     * The refresh method periodically refreshes the DNS cache.
+     * It iterates over each hostname in the cache, resolves the hostname to an IP address,
+     * and compares it with the current IP address in the cache.
+     * If they are different, it updates the cache with the new IP address and logs the change.
+     * If a hostname cannot be resolved, the cached IP address is kept and the hostname is retried
+     * in the next round.
+     */
+    private void refresh() {
+        for (String hostname : cache.keySet()) {
+            try {
+                String resolvedIp = resolveHostname(hostname);
+                if (resolvedIp.isEmpty()) {
+                    // A transient resolver failure must not wipe out a usable IP.
+                    LOG.warn("Failed to resolve hostname {} when refreshing DNS cache, keep the cached IP",
+                            hostname);
+                    continue;
+                }
+                String previousIp = cache.put(hostname, resolvedIp);
+                if (!resolvedIp.equals(previousIp)) {
+                    LOG.info("IP for hostname {} has changed from {} to {}", hostname, previousIp,
+                            resolvedIp);
+                }
+            } catch (RuntimeException e) {
+                // scheduleAtFixedRate suppresses all subsequent executions once a run throws,
+                // so an unexpected error for one hostname must not escape this method.
+                LOG.warn("Unexpected error when refreshing DNS cache for hostname {}", hostname, e);
+            }
+        }
+    }
+}
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/common/util/NetUtils.java 
b/fe/fe-core/src/main/java/org/apache/doris/common/util/NetUtils.java
index 0c1ac130cde..9b787f52bf4 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/common/util/NetUtils.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/common/util/NetUtils.java
@@ -95,9 +95,19 @@ public class NetUtils {
         return hostName;
     }
 
-    public static String getIpByHost(String host) throws UnknownHostException {
-        InetAddress inetAddress = InetAddress.getByName(host);
-        return inetAddress.getHostAddress();
+    /**
+     * Resolves {@code host} with InetAddress.getByName and returns the resulting host address string.
+     * When the lookup throws UnknownHostException it is retried immediately (no wait between attempts)
+     * until {@code retryTimes} retries have been used up; every failed attempt is logged as a warning.
+     *
+     * @param host the host name to resolve
+     * @param retryTimes number of retries after the first failed attempt; a value <= 0 means a single attempt
+     * @return the host address string of the resolved InetAddress
+     * @throws UnknownHostException the exception of the last attempt, if all attempts fail
+     */
+    public static String getIpByHost(String host, int retryTimes) throws 
UnknownHostException {
+        InetAddress inetAddress;
+        while (true) {
+            try {
+                inetAddress = InetAddress.getByName(host);
+                return inetAddress.getHostAddress();
+            } catch (UnknownHostException e) {
+                LOG.warn("Get IP by host failed, hostname: {}, remaining 
retryTimes: {}", host, retryTimes, e);
+                // post-decrement: compare the current budget first, then consume one retry
+                if (retryTimes-- <= 0) {
+                    throw e;
+                }
+            }
+        }
+    }
 
     // This is the implementation is inspired by Apache camel project:
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/rpc/BackendServiceProxy.java 
b/fe/fe-core/src/main/java/org/apache/doris/rpc/BackendServiceProxy.java
index d78e055a1ab..af21194263f 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/rpc/BackendServiceProxy.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/rpc/BackendServiceProxy.java
@@ -17,9 +17,9 @@
 
 package org.apache.doris.rpc;
 
+import org.apache.doris.catalog.Env;
 import org.apache.doris.common.Config;
 import org.apache.doris.common.ThreadPoolManager;
-import org.apache.doris.common.util.NetUtils;
 import org.apache.doris.metric.MetricRepo;
 import org.apache.doris.planner.PlanFragmentId;
 import org.apache.doris.proto.InternalService;
@@ -112,7 +112,7 @@ public class BackendServiceProxy {
     }
 
     private BackendServiceClient getProxy(TNetworkAddress address) throws 
UnknownHostException {
-        String realIp = NetUtils.getIpByHost(address.getHostname());
+        String realIp = 
Env.getCurrentEnv().getDnsCache().get(address.hostname);
         BackendServiceClientExtIp serviceClientExtIp = serviceMap.get(address);
         if (serviceClientExtIp != null && 
serviceClientExtIp.realIp.equals(realIp)
                 && serviceClientExtIp.client.isNormalState()) {


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to