This addresses a FIXME in src/copy.c:
----
-/* FIXME: rewrite this to use a hash table so we avoid the quadratic
- performance hit that's probably noticeable only on trees deeper
- than a few hundred levels. See use of active_dir_map in remove.c */
----
The performance benefit is there, but on my machine with a PATH_MAX of
4096 it's hard to see, because the userland work `cp' does is dwarfed
by the work the kernel does on its behalf:
----
$ time src/cp.master -r a b
real 0m54.032s
user 0m3.236s <-- coreutils HEAD
sys 0m47.335s
$ time src/cp -r a c
real 0m53.475s
user 0m0.624s <-- with patch
sys 0m48.639s
----
Thanks,
Bo
From 224328e4bda44aa25cd5c98b1c13751ecea865c7 Mon Sep 17 00:00:00 2001
From: Bo Borgerson <[EMAIL PROTECTED]>
Date: Wed, 16 Apr 2008 13:44:36 -0400
Subject: [PATCH] Use a hash rather than a linked-list for cycle check in cp
* NEWS: List the change of cycle check behavior.
* src/copy.c (struct dir_info): These go in the hash.
(static Hash_table *ancestry_hash): This is the hash.
(static size_t dir_info_hasher): Simple hasher based on INO.
(static bool dir_info_comparator): Checks equivalence of INO and DEV.
(static bool ancestry_insert): Insert a dir_info entry for the current dir.
(static bool ancestry_delete): Delete the hash entry for the current dir.
(copy_internal): Now calls ancestry_insert and ancestry_delete instead of
managing the linked-list inline and calling is_ancestor for the cycle check.
Signed-off-by: Bo Borgerson <[EMAIL PROTECTED]>
---
NEWS | 2 +
src/copy.c | 106 +++++++++++++++++++++++++++++++++++++++--------------------
2 files changed, 72 insertions(+), 36 deletions(-)
diff --git a/NEWS b/NEWS
index 3a584e9..111e0c1 100644
--- a/NEWS
+++ b/NEWS
@@ -69,6 +69,8 @@ GNU coreutils NEWS -*- outline -*-
seq gives better diagnostics for invalid formats.
+ cp now uses a hash for cycle checks rather than a linked-list of ancestry.
+
** Portability
rm now works properly even on systems like BeOS and Haiku,
diff --git a/src/copy.c b/src/copy.c
index c2f21a3..8e673f0 100644
--- a/src/copy.c
+++ b/src/copy.c
@@ -83,19 +83,74 @@ rpl_mkfifo (char const *file, mode_t mode)
#define SAME_GROUP(A, B) ((A).st_gid == (B).st_gid)
#define SAME_OWNER_AND_GROUP(A, B) (SAME_OWNER (A, B) && SAME_GROUP (A, B))
-struct dir_list
+
+/* This is for looking up directories by inode/device to ensure we don't
+ have any cycles. */
+static Hash_table *ancestors_hash;
+
+enum { INIT_ANCESTRY_SIZE = 47 };
+
+struct dir_info
{
- struct dir_list *parent;
ino_t ino;
dev_t dev;
};
+static size_t
+dir_info_hasher (const void *entry, size_t tabsize)
+{
+ const struct dir_info *node = entry;
+ return node->ino % tabsize;
+}
+
+static bool
+dir_info_comparator (const void *e1, const void *e2)
+{
+ const struct dir_info *n1 = e1, *n2 = e2;
+ return n1->ino == n2->ino && n1->dev == n2->dev;
+}
+
+/* First check whether this directory has been seen. If it has then
+ return false because we've encountered a cycle in the graph. If it
+ hasn't, then insert it into the ancestry hash and return true. */
+
+static bool
+ancestry_insert (struct dir_info *dir, struct stat *src_sb)
+{
+ dir->ino = src_sb->st_ino;
+ dir->dev = src_sb->st_dev;
+
+ if (! ancestors_hash)
+ {
+ ancestors_hash = hash_initialize (INIT_ANCESTRY_SIZE, NULL,
+ dir_info_hasher,
+ dir_info_comparator, NULL);
+ if (! ancestors_hash)
+ xalloc_die ();
+ }
+
+ if (hash_lookup (ancestors_hash, dir))
+ return false;
+
+ hash_insert (ancestors_hash, dir);
+
+ return true;
+}
+
+/* This is called when recursion through DIR is complete. Note that
+ `*dir' is actually a pointer into the stack of the caller, so there's
+ no free here. */
+static inline void
+ancestry_delete (const struct dir_info *dir)
+{
+ hash_delete (ancestors_hash, dir);
+}
+
/* Initial size of the cp.dest_info hash table. */
#define DEST_INFO_INITIAL_CAPACITY 61
static bool copy_internal (char const *src_name, char const *dst_name,
bool new_dst, dev_t device,
- struct dir_list *ancestors,
const struct cp_options *x,
bool command_line_arg,
bool *copy_into_self,
@@ -110,23 +165,6 @@ static char const *top_level_dst_name;
/* The invocation name of this program. */
extern char *program_name;
-/* FIXME: describe */
-/* FIXME: rewrite this to use a hash table so we avoid the quadratic
- performance hit that's probably noticeable only on trees deeper
- than a few hundred levels. See use of active_dir_map in remove.c */
-
-static bool
-is_ancestor (const struct stat *sb, const struct dir_list *ancestors)
-{
- while (ancestors != 0)
- {
- if (ancestors->ino == sb->st_ino && ancestors->dev == sb->st_dev)
- return true;
- ancestors = ancestors->parent;
- }
- return false;
-}
-
/* Read the contents of the directory SRC_NAME_IN, and recursively
copy the contents to DST_NAME_IN. NEW_DST is true if
DST_NAME_IN is a directory that was created previously in the
@@ -137,8 +175,8 @@ is_ancestor (const struct stat *sb, const struct dir_list *ancestors)
static bool
copy_dir (char const *src_name_in, char const *dst_name_in, bool new_dst,
- const struct stat *src_sb, struct dir_list *ancestors,
- const struct cp_options *x, bool *copy_into_self)
+ const struct stat *src_sb, const struct cp_options *x,
+ bool *copy_into_self)
{
char *name_space;
char *namep;
@@ -167,7 +205,7 @@ copy_dir (char const *src_name_in, char const *dst_name_in, bool new_dst,
char *dst_name = file_name_concat (dst_name_in, namep, NULL);
ok &= copy_internal (src_name, dst_name, new_dst, src_sb->st_dev,
- ancestors, &non_command_line_options, false,
+ &non_command_line_options, false,
&local_copy_into_self, NULL);
*copy_into_self |= local_copy_into_self;
@@ -1056,7 +1094,6 @@ static bool
copy_internal (char const *src_name, char const *dst_name,
bool new_dst,
dev_t device,
- struct dir_list *ancestors,
const struct cp_options *x,
bool command_line_arg,
bool *copy_into_self,
@@ -1673,27 +1710,21 @@ copy_internal (char const *src_name, char const *dst_name,
if (S_ISDIR (src_mode))
{
- struct dir_list *dir;
+ struct dir_info dir;
- /* If this directory has been copied before during the
+ /* Insert the current directory in the list of parents.
+ If this directory has been copied before during the
recursion, there is a symbolic link to an ancestor
directory of the symbolic link. It is impossible to
continue to copy this, unless we've got an infinite disk. */
- if (is_ancestor (&src_sb, ancestors))
+ if (! ancestry_insert (&dir, &src_sb))
{
error (0, 0, _("cannot copy cyclic symbolic link %s"),
quote (src_name));
goto un_backup;
}
- /* Insert the current directory in the list of parents. */
-
- dir = alloca (sizeof *dir);
- dir->parent = ancestors;
- dir->ino = src_sb.st_ino;
- dir->dev = src_sb.st_dev;
-
if (new_dst || !S_ISDIR (dst_sb.st_mode))
{
/* POSIX says mkdir's behavior is implementation-defined when
@@ -1753,9 +1784,12 @@ copy_internal (char const *src_name, char const *dst_name,
this fails -- otherwise, the failure to read a single file
in a source directory would cause the containing destination
directory not to have owner/perms set properly. */
- delayed_ok = copy_dir (src_name, dst_name, new_dst, &src_sb, dir, x,
+ delayed_ok = copy_dir (src_name, dst_name, new_dst, &src_sb, x,
copy_into_self);
}
+
+ /* This whole branch has been copied now. */
+ ancestry_delete (&dir);
}
else if (x->symbolic_link)
{
@@ -2097,7 +2131,7 @@ copy (char const *src_name, char const *dst_name,
top_level_src_name = src_name;
top_level_dst_name = dst_name;
- return copy_internal (src_name, dst_name, nonexistent_dst, 0, NULL,
+ return copy_internal (src_name, dst_name, nonexistent_dst, 0,
options, true, copy_into_self, rename_succeeded);
}
--
1.5.2.5
_______________________________________________
Bug-coreutils mailing list
[email protected]
http://lists.gnu.org/mailman/listinfo/bug-coreutils