This is an automated email from the ASF dual-hosted git repository.

dataroaring pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-3.0 by this push:
     new e1cb7ca86d5 branch-3.0: [Fix](recycler) Fix recycler Hdfs accessor 
list all bug #46393 (#46918)
e1cb7ca86d5 is described below

commit e1cb7ca86d5599c871931595fe4e53d37c8919af
Author: github-actions[bot] 
<41898282+github-actions[bot]@users.noreply.github.com>
AuthorDate: Mon Jan 13 21:10:54 2025 +0800

    branch-3.0: [Fix](recycler) Fix recycler Hdfs accessor list all bug #46393 
(#46918)
    
    Cherry-picked from #46393
    
    Co-authored-by: abmdocrt <[email protected]>
---
 cloud/src/recycler/hdfs_accessor.cpp |  47 +++++++-
 cloud/test/hdfs_accessor_test.cpp    | 200 +++++++++++++++++++++++++----------
 2 files changed, 193 insertions(+), 54 deletions(-)

diff --git a/cloud/src/recycler/hdfs_accessor.cpp 
b/cloud/src/recycler/hdfs_accessor.cpp
index 97a4670d2bf..1999bcfa165 100644
--- a/cloud/src/recycler/hdfs_accessor.cpp
+++ b/cloud/src/recycler/hdfs_accessor.cpp
@@ -342,6 +342,19 @@ std::string HdfsAccessor::to_uri(const std::string& 
relative_path) {
     return uri_ + '/' + relative_path;
 }
 
+// extract parent path from prefix
+// e.g.
+// data/492211/02000000008a012957476a3e174dfdaa71ee5f80a3abafa3_ -> 
data/492211/
+std::string extract_parent_path(const std::string& path) {
+    // Find the last '/'
+    size_t last_slash = path.find_last_of('/');
+    if (last_slash == std::string::npos) {
+        LOG_WARNING("no '/' found in path").tag("path", path);
+        return "";
+    }
+    return path.substr(0, last_slash + 1);
+}
+
 int HdfsAccessor::init() {
     // TODO(plat1ko): Cache hdfsFS
     fs_ = HDFSBuilder::create_fs(info_.build_conf());
@@ -356,8 +369,35 @@ int HdfsAccessor::init() {
 int HdfsAccessor::delete_prefix(const std::string& path_prefix, int64_t 
expiration_time) {
     auto uri = to_uri(path_prefix);
     LOG(INFO) << "delete prefix, uri=" << uri;
+    // If path prefix exists, assume it is a dir or a file.
+    if (exists(path_prefix) == 0) {
+        // If it exists, then it is a dir or a file.
+        // delete_directory func can delete a dir or a file.
+        if (delete_directory(path_prefix) == 0) {
+            LOG(INFO) << "delete prefix succ"
+                      << ", is dir or file = true"
+                      << ", uri=" << uri;
+            return 0;
+        }
+        // delete failed, return err
+        LOG_WARNING("delete prefix failed, this is a dir or a file")
+                .tag("path prefix", path_prefix);
+        return -1;
+    }
+    // If path prefix is not a dir or a file,
+    // for example: 
data/492211/02000000008a012957476a3e174dfdaa71ee5f80a3abafa3_.
+    // Then we need to extract the parent id path from the given prefix,
+    // traverse all files in the parent id path, and delete the files that 
match the prefix.
     std::unique_ptr<ListIterator> list_iter;
-    int ret = list_all(&list_iter);
+    auto parent_path = extract_parent_path(path_prefix);
+    if (parent_path.empty()) {
+        LOG_WARNING("extract parent path failed").tag("path prefix", 
path_prefix);
+        return -1;
+    }
+    LOG_INFO("path prefix is not a dir, extract parent path success")
+            .tag("path prefix", path_prefix)
+            .tag("parent path", parent_path);
+    int ret = list_directory(parent_path, &list_iter);
     if (ret != 0) {
         LOG(WARNING) << "delete prefix, failed to list" << uri;
         return ret;
@@ -372,6 +412,10 @@ int HdfsAccessor::delete_prefix(const std::string& 
path_prefix, int64_t expirati
         }
         ++num_deleted;
     }
+    if (num_deleted == 0) {
+        LOG_WARNING("recycler delete prefix num = 0, maybe there are some 
problems?")
+                .tag("path prefix", path_prefix);
+    }
     LOG(INFO) << "delete prefix " << (ret != 0 ? "failed" : "succ") << " ret=" 
<< ret
               << " uri=" << uri << " num_listed=" << num_listed << " 
num_deleted=" << num_deleted;
     return ret;
@@ -382,6 +426,7 @@ int HdfsAccessor::delete_directory_impl(const std::string& 
dir_path) {
     // `hdfsDelete`'s return value or errno to avoid exist rpc?
     int ret = exists(dir_path);
     if (ret == 1) {
+        // dir does not exist
         return 0;
     } else if (ret < 0) {
         return ret;
diff --git a/cloud/test/hdfs_accessor_test.cpp 
b/cloud/test/hdfs_accessor_test.cpp
index cabd07f8c9a..11c0af3853b 100644
--- a/cloud/test/hdfs_accessor_test.cpp
+++ b/cloud/test/hdfs_accessor_test.cpp
@@ -22,6 +22,7 @@
 #include <gtest/gtest.h>
 
 #include <iostream>
+#include <vector>
 
 #include "common/config.h"
 #include "common/logging.h"
@@ -59,20 +60,20 @@ TEST(HdfsAccessorTest, normal) {
 
     HdfsAccessor accessor(info);
     int ret = accessor.init();
-    ASSERT_EQ(ret, 0);
+    EXPECT_EQ(ret, 0);
 
     std::string file1 = "data/10000/1_0.dat";
 
     ret = accessor.delete_directory("");
-    ASSERT_NE(ret, 0);
+    EXPECT_NE(ret, 0);
     ret = accessor.delete_all();
-    ASSERT_EQ(ret, 0);
+    EXPECT_EQ(ret, 0);
 
     ret = accessor.put_file(file1, "");
-    ASSERT_EQ(ret, 0);
+    EXPECT_EQ(ret, 0);
 
     ret = accessor.exists(file1);
-    ASSERT_EQ(ret, 0);
+    EXPECT_EQ(ret, 0);
 
     auto* sp = SyncPoint::get_instance();
     sp->enable_processing();
@@ -90,46 +91,46 @@ TEST(HdfsAccessorTest, normal) {
 
     std::unique_ptr<ListIterator> iter;
     ret = accessor.list_directory("data", &iter);
-    ASSERT_EQ(ret, 0);
-    ASSERT_TRUE(iter);
-    ASSERT_TRUE(iter->is_valid());
-    ASSERT_TRUE(iter->has_next());
-    ASSERT_EQ(iter->next()->path, file1);
-    ASSERT_FALSE(iter->has_next());
+    EXPECT_EQ(ret, 0);
+    EXPECT_TRUE(iter);
+    EXPECT_TRUE(iter->is_valid());
+    EXPECT_TRUE(iter->has_next());
+    EXPECT_EQ(iter->next()->path, file1);
+    EXPECT_FALSE(iter->has_next());
     iter.reset();
-    ASSERT_EQ(alloc_entries, 0);
+    EXPECT_EQ(alloc_entries, 0);
 
     ret = accessor.list_directory("data/", &iter);
-    ASSERT_EQ(ret, 0);
-    ASSERT_TRUE(iter->is_valid());
-    ASSERT_TRUE(iter->has_next());
-    ASSERT_EQ(iter->next()->path, file1);
-    ASSERT_FALSE(iter->has_next());
-    ASSERT_FALSE(iter->next());
+    EXPECT_EQ(ret, 0);
+    EXPECT_TRUE(iter->is_valid());
+    EXPECT_TRUE(iter->has_next());
+    EXPECT_EQ(iter->next()->path, file1);
+    EXPECT_FALSE(iter->has_next());
+    EXPECT_FALSE(iter->next());
     iter.reset();
-    ASSERT_EQ(alloc_entries, 0);
+    EXPECT_EQ(alloc_entries, 0);
 
     ret = accessor.list_directory("data/100", &iter);
-    ASSERT_EQ(ret, 0);
-    ASSERT_FALSE(iter->has_next());
-    ASSERT_FALSE(iter->next());
+    EXPECT_EQ(ret, 0);
+    EXPECT_FALSE(iter->has_next());
+    EXPECT_FALSE(iter->next());
     iter.reset();
-    ASSERT_EQ(alloc_entries, 0);
+    EXPECT_EQ(alloc_entries, 0);
 
     ret = accessor.delete_file(file1);
-    ASSERT_EQ(ret, 0);
+    EXPECT_EQ(ret, 0);
     ret = accessor.exists(file1);
-    ASSERT_EQ(ret, 1);
+    EXPECT_EQ(ret, 1);
     ret = accessor.list_directory("", &iter);
-    ASSERT_NE(ret, 0);
+    EXPECT_NE(ret, 0);
     ret = accessor.list_all(&iter);
-    ASSERT_EQ(ret, 0);
-    ASSERT_FALSE(iter->has_next());
-    ASSERT_FALSE(iter->next());
+    EXPECT_EQ(ret, 0);
+    EXPECT_FALSE(iter->has_next());
+    EXPECT_FALSE(iter->next());
     iter.reset();
-    ASSERT_EQ(alloc_entries, 0);
+    EXPECT_EQ(alloc_entries, 0);
     ret = accessor.delete_file(file1);
-    ASSERT_EQ(ret, 0);
+    EXPECT_EQ(ret, 0);
 
     std::vector<std::string> files;
     for (int dir = 10000; dir < 10005; ++dir) {
@@ -140,18 +141,18 @@ TEST(HdfsAccessorTest, normal) {
 
     for (auto&& file : files) {
         ret = accessor.put_file(file, "");
-        ASSERT_EQ(ret, 0);
+        EXPECT_EQ(ret, 0);
     }
 
     std::unordered_set<std::string> list_files;
     ret = accessor.list_all(&iter);
-    ASSERT_EQ(ret, 0);
+    EXPECT_EQ(ret, 0);
     for (auto file = iter->next(); file.has_value(); file = iter->next()) {
         list_files.insert(std::move(file->path));
     }
     iter.reset();
-    ASSERT_EQ(alloc_entries, 0);
-    ASSERT_EQ(list_files.size(), files.size());
+    EXPECT_EQ(alloc_entries, 0);
+    EXPECT_EQ(list_files.size(), files.size());
     for (auto&& file : files) {
         EXPECT_TRUE(list_files.contains(file));
     }
@@ -163,69 +164,162 @@ TEST(HdfsAccessorTest, normal) {
         files.pop_back();
     }
     ret = accessor.delete_files(to_delete_files);
-    ASSERT_EQ(ret, 0);
+    EXPECT_EQ(ret, 0);
 
     ret = accessor.list_all(&iter);
-    ASSERT_EQ(ret, 0);
+    EXPECT_EQ(ret, 0);
     list_files.clear();
     for (auto file = iter->next(); file.has_value(); file = iter->next()) {
         list_files.insert(std::move(file->path));
     }
     iter.reset();
-    ASSERT_EQ(alloc_entries, 0);
-    ASSERT_EQ(list_files.size(), files.size());
+    EXPECT_EQ(alloc_entries, 0);
+    EXPECT_EQ(list_files.size(), files.size());
     for (auto&& file : files) {
         EXPECT_TRUE(list_files.contains(file));
     }
 
     std::string to_delete_dir = "data/10001";
     ret = accessor.delete_directory(to_delete_dir);
-    ASSERT_EQ(ret, 0);
+    EXPECT_EQ(ret, 0);
     ret = accessor.list_directory(to_delete_dir, &iter);
-    ASSERT_EQ(ret, 0);
-    ASSERT_FALSE(iter->has_next());
+    EXPECT_EQ(ret, 0);
+    EXPECT_FALSE(iter->has_next());
 
     files.erase(std::remove_if(files.begin(), files.end(),
                                [&](auto&& file) { return 
file.starts_with(to_delete_dir); }),
                 files.end());
     ret = accessor.list_all(&iter);
-    ASSERT_EQ(ret, 0);
+    EXPECT_EQ(ret, 0);
     list_files.clear();
     for (auto file = iter->next(); file.has_value(); file = iter->next()) {
         list_files.insert(std::move(file->path));
     }
     iter.reset();
-    ASSERT_EQ(alloc_entries, 0);
-    ASSERT_EQ(list_files.size(), files.size());
+    EXPECT_EQ(alloc_entries, 0);
+    EXPECT_EQ(list_files.size(), files.size());
     for (auto&& file : files) {
         EXPECT_TRUE(list_files.contains(file));
     }
 
     std::string to_delete_prefix = "data/10003/";
     ret = accessor.delete_directory(to_delete_prefix);
-    ASSERT_EQ(ret, 0);
+    EXPECT_EQ(ret, 0);
     files.erase(std::remove_if(files.begin(), files.end(),
                                [&](auto&& file) { return 
file.starts_with(to_delete_prefix); }),
                 files.end());
     ret = accessor.list_all(&iter);
-    ASSERT_EQ(ret, 0);
+    EXPECT_EQ(ret, 0);
     list_files.clear();
     for (auto file = iter->next(); file.has_value(); file = iter->next()) {
         list_files.insert(std::move(file->path));
     }
     iter.reset();
-    ASSERT_EQ(alloc_entries, 0);
-    ASSERT_EQ(list_files.size(), files.size());
+    EXPECT_EQ(alloc_entries, 0);
+    EXPECT_EQ(list_files.size(), files.size());
     for (auto&& file : files) {
         EXPECT_TRUE(list_files.contains(file));
     }
 
     ret = accessor.delete_all();
-    ASSERT_EQ(ret, 0);
+    EXPECT_EQ(ret, 0);
     ret = accessor.list_all(&iter);
-    ASSERT_EQ(ret, 0);
-    ASSERT_FALSE(iter->has_next());
-    ASSERT_FALSE(iter->next());
+    EXPECT_EQ(ret, 0);
+    EXPECT_FALSE(iter->has_next());
+    EXPECT_FALSE(iter->next());
+}
+
+TEST(HdfsAccessorTest, delete_prefix) {
+    HdfsVaultInfo info;
+    info.set_prefix(config::test_hdfs_prefix + "/HdfsAccessorTest/" + 
butil::GenerateGUID());
+    auto* conf = info.mutable_build_conf();
+    conf->set_fs_name(config::test_hdfs_fs_name);
+
+    HdfsAccessor accessor(info);
+    int ret = accessor.init();
+    EXPECT_EQ(ret, 0);
+
+    auto put_and_verify = [&accessor](const std::string& file) {
+        int ret = accessor.put_file(file, "");
+        EXPECT_EQ(ret, 0);
+        ret = accessor.exists(file);
+        EXPECT_EQ(ret, 0);
+    };
+
+    ret = accessor.delete_directory("");
+    EXPECT_NE(ret, 0);
+    ret = accessor.delete_all();
+    EXPECT_EQ(ret, 0);
+
+    put_and_verify("data/10000/1_0.dat");
+    put_and_verify("data/10000/2_0.dat");
+    put_and_verify("data/10000/20000/1_0.dat");
+    put_and_verify("data/10000/20000/30000/1_0.dat");
+    put_and_verify("data/20000/1_0.dat");
+    put_and_verify("data111/10000/1_0.dat");
+
+    ret = accessor.delete_prefix("data/10000/1_");
+    EXPECT_EQ(ret, 0);
+    ret = accessor.delete_prefix("data/10000/2_");
+    EXPECT_EQ(ret, 0);
+
+    std::unordered_set<std::string> list_files;
+    std::unique_ptr<ListIterator> iter;
+    ret = accessor.list_all(&iter);
+    EXPECT_EQ(ret, 0);
+    list_files.clear();
+    for (auto file = iter->next(); file.has_value(); file = iter->next()) {
+        list_files.insert(std::move(file->path));
+    }
+    EXPECT_EQ(list_files.size(), 4);
+    EXPECT_TRUE(list_files.contains("data/10000/20000/1_0.dat"));
+    EXPECT_TRUE(list_files.contains("data/10000/20000/30000/1_0.dat"));
+    EXPECT_TRUE(list_files.contains("data/20000/1_0.dat"));
+    EXPECT_TRUE(list_files.contains("data111/10000/1_0.dat"));
+
+    ret = accessor.delete_prefix("data/10000/1_");
+    EXPECT_EQ(ret, 0);
+    ret = accessor.delete_prefix("data/10000/2_");
+    EXPECT_EQ(ret, 0);
+    ret = accessor.delete_prefix("data/20000/1_");
+    EXPECT_EQ(ret, 0);
+
+    iter.reset();
+    ret = accessor.list_all(&iter);
+    EXPECT_EQ(ret, 0);
+    list_files.clear();
+    for (auto file = iter->next(); file.has_value(); file = iter->next()) {
+        list_files.insert(std::move(file->path));
+    }
+    EXPECT_EQ(list_files.size(), 3);
+    EXPECT_TRUE(list_files.contains("data/10000/20000/30000/1_0.dat"));
+    EXPECT_TRUE(list_files.contains("data/10000/20000/1_0.dat"));
+    EXPECT_TRUE(list_files.contains("data111/10000/1_0.dat"));
+
+    ret = accessor.delete_prefix("data/10000/20000");
+    EXPECT_EQ(ret, 0);
+
+    iter.reset();
+    ret = accessor.list_all(&iter);
+    EXPECT_EQ(ret, 0);
+    list_files.clear();
+    for (auto file = iter->next(); file.has_value(); file = iter->next()) {
+        list_files.insert(std::move(file->path));
+    }
+    EXPECT_EQ(list_files.size(), 1);
+    EXPECT_TRUE(list_files.contains("data111/10000/1_0.dat"));
+
+    ret = accessor.delete_prefix("data111/10000/1_0.dat");
+    EXPECT_EQ(ret, 0);
+
+    iter.reset();
+    ret = accessor.list_all(&iter);
+    EXPECT_EQ(ret, 0);
+    list_files.clear();
+    for (auto file = iter->next(); file.has_value(); file = iter->next()) {
+        list_files.insert(std::move(file->path));
+    }
+    EXPECT_EQ(list_files.size(), 0);
 }
 
 } // namespace doris::cloud


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to