Re: [PATCH v2 2/3] count-objects: report garbage files in pack directory too
Nguyễn Thái Ngọc Duy pclo...@gmail.com writes: prepare_packed_git_one() is modified to allow count-objects to hook a report function to so we don't need to duplicate the pack searching logic in count-objects.c. When report_pack_garbage is NULL, the overhead is insignificant. Signed-off-by: Nguyễn Thái Ngọc Duy pclo...@gmail.com --- Documentation/git-count-objects.txt | 4 +- builtin/count-objects.c | 18 - sha1_file.c | 81 +++-- 3 files changed, 97 insertions(+), 6 deletions(-) diff --git a/Documentation/git-count-objects.txt b/Documentation/git-count-objects.txt index e816823..1611d7c 100644 --- a/Documentation/git-count-objects.txt +++ b/Documentation/git-count-objects.txt @@ -33,8 +33,8 @@ size-pack: disk space consumed by the packs, in KiB prune-packable: the number of loose objects that are also present in the packs. These objects could be pruned using `git prune-packed`. + -garbage: the number of files in loose object database that are not -valid loose objects +garbage: the number of files in object database that are not valid +loose objects nor valid packs GIT --- diff --git a/builtin/count-objects.c b/builtin/count-objects.c index 9afaa88..118b2ae 100644 --- a/builtin/count-objects.c +++ b/builtin/count-objects.c @@ -9,6 +9,20 @@ #include builtin.h #include parse-options.h +static unsigned long garbage; + +extern void (*report_pack_garbage)(const char *path, int len, const char *name); +static void real_report_pack_garbage(const char *path, int len, const char *name) +{ Don't some callers call this on paths outside objects/pack/ directory? Is it still report-pack-garbage? + if (len name) + error(garbage found: %.*s/%s, len, path, name); + else if (!len name) + error(garbage found: %s%s, path, name); + else + error(garbage found: %s, path); + garbage++; +} + static void count_objects(DIR *d, char *path, int len, int verbose, unsigned long *loose, off_t *loose_size, @@ -76,7 +90,7 @@ int cmd_count_objects(int argc, const char **argv, const char *prefix) const char *objdir = get_object_directory(); int len = strlen(objdir); char *path = xmalloc(len + 50); - unsigned long loose = 0, packed = 0, packed_loose = 0, garbage = 0; + unsigned long loose = 0, packed = 0, packed_loose = 0; off_t loose_size = 0; struct option opts[] = { OPT__VERBOSE(verbose, N_(be verbose)), @@ -87,6 +101,8 @@ int cmd_count_objects(int argc, const char **argv, const char *prefix) /* we do not take arguments other than flags for now */ if (argc) usage_with_options(count_objects_usage, opts); + if (verbose) + report_pack_garbage = real_report_pack_garbage; memcpy(path, objdir, len); if (len objdir[len-1] != '/') path[len++] = '/'; diff --git a/sha1_file.c b/sha1_file.c index 40b2329..cc6ef03 100644 --- a/sha1_file.c +++ b/sha1_file.c @@ -21,6 +21,7 @@ #include sha1-lookup.h #include bulk-checkin.h #include streaming.h +#include dir.h #ifndef O_NOATIME #if defined(__linux__) (defined(__i386__) || defined(__PPC__)) @@ -1000,6 +1001,54 @@ void install_packed_git(struct packed_git *pack) packed_git = pack; } +/* A hook for count-objects to report invalid files in pack directory */ +void (*report_pack_garbage)(const char *path, int len, const char *name); + +static const char *known_pack_extensions[] = { .pack, .keep, NULL }; This sounds wrong. Isn't .idx also known? +static void report_garbage(struct string_list *list) +{ + struct strbuf sb = STRBUF_INIT; + struct packed_git *p; + int i; + + if (!report_pack_garbage) + return; + + sort_string_list(list); + + for (p = packed_git; p; p = p-next) { + struct string_list_item *item; + if (!p-pack_local) + continue; + strbuf_reset(sb); + strbuf_add(sb, p-pack_name, +strlen(p-pack_name) - strlen(.pack)); + item = string_list_lookup(list, sb.buf); + if (!item) + continue; + /* + * string_list_lookup does not guarantee to return the + * first matched string if it's duplicated. + */ + while (item - list-items +!strcmp(item[-1].string, item-string)) + item--; + while (item - list-items list-nr +!strcmp(item-string, sb.buf)) { + item-util = NULL; /* non-garbage mark */ + item++; + } + } + for (i = 0; i list-nr; i++) { + struct string_list_item *item =
Re: [PATCH v2 2/3] count-objects: report garbage files in pack directory too
On Sat, Feb 9, 2013 at 1:44 AM, Junio C Hamano gits...@pobox.com wrote: +static void real_report_pack_garbage(const char *path, int len, const char *name) +{ Don't some callers call this on paths outside objects/pack/ directory? Is it still report-pack-garbage? In fact 3/3 uses it to report loose garbage. Will rename. +static const char *known_pack_extensions[] = { .pack, .keep, NULL }; This sounds wrong. Isn't .idx also known? I had a comment saying all known extensions related to a pack, except .idx but I dropped it. .idx being used as the pointer file needs to be handled separately. @@ -1024,14 +1074,37 @@ static void prepare_packed_git_one(char *objdir, int local) int namelen = strlen(de-d_name); struct packed_git *p; - if (!has_extension(de-d_name, .idx)) + if (len + namelen + 1 sizeof(path)) { + if (report_pack_garbage) + report_pack_garbage(path, len - 1, de-d_name); A pack/in/a/very/long/path/pack-.pack may pass when fed to git verify-pack, but this will report it as garbage, without reporting what is wrong with it. Wouldn't that confuse users? If all other git commands do not recognize the pack, then I think it's still considered garbage. We could either - make prepare_packed_git_one accept pack/in/a/very/long/path-... - show the reason why we consider it garbage Which option do we do? Option 1 may involve chdir in, stat stat stat and chdir out to get around short PATH_MAX limit on some system. - if (len + namelen + 1 sizeof(path)) + if (!has_extension(de-d_name, .idx)) { + struct string_list_item *item; + int i, n; + if (!report_pack_garbage) + continue; + if (is_dot_or_dotdot(de-d_name)) + continue; + for (i = 0; known_pack_extensions[i]; i++) + if (has_extension(de-d_name, + known_pack_extensions[i])) + break; + if (!known_pack_extensions[i]) { + report_pack_garbage(path, 0, NULL); + continue; + } + n = strlen(path) - strlen(known_pack_extensions[i]); + item = string_list_append_nodup(garbage, + xstrndup(path, n)); + item-util = (void*)known_pack_extensions[i]; continue; + } Why isn't this part more like this? if (dot-or-dotdot) { continue; } else if (has_extension(de-d_name, .idx)) { do things for the .idx file; } else if (has_extension(de-d_name, .pack) { do things for the .pack file, including queuing the name if we haven't seen corresponding .idx for later examination; } else if (has_extension(de-d_name, .keep) { nothing special for now but we may want to add some other checks later } else { everything else is a garbage report_pack_garbage(); } Originally I checked known_extensions again in report_pack_garbage() but after restructuring, yeah we can drop known_extensions and do it your way. Looks much clearer. -- Duy -- To unsubscribe from this list: send the line unsubscribe git in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v2 2/3] count-objects: report garbage files in pack directory too
prepare_packed_git_one() is modified to allow count-objects to hook a report function to so we don't need to duplicate the pack searching logic in count-objects.c. When report_pack_garbage is NULL, the overhead is insignificant. Signed-off-by: Nguyễn Thái Ngọc Duy pclo...@gmail.com --- Documentation/git-count-objects.txt | 4 +- builtin/count-objects.c | 18 - sha1_file.c | 81 +++-- 3 files changed, 97 insertions(+), 6 deletions(-) diff --git a/Documentation/git-count-objects.txt b/Documentation/git-count-objects.txt index e816823..1611d7c 100644 --- a/Documentation/git-count-objects.txt +++ b/Documentation/git-count-objects.txt @@ -33,8 +33,8 @@ size-pack: disk space consumed by the packs, in KiB prune-packable: the number of loose objects that are also present in the packs. These objects could be pruned using `git prune-packed`. + -garbage: the number of files in loose object database that are not -valid loose objects +garbage: the number of files in object database that are not valid +loose objects nor valid packs GIT --- diff --git a/builtin/count-objects.c b/builtin/count-objects.c index 9afaa88..118b2ae 100644 --- a/builtin/count-objects.c +++ b/builtin/count-objects.c @@ -9,6 +9,20 @@ #include builtin.h #include parse-options.h +static unsigned long garbage; + +extern void (*report_pack_garbage)(const char *path, int len, const char *name); +static void real_report_pack_garbage(const char *path, int len, const char *name) +{ + if (len name) + error(garbage found: %.*s/%s, len, path, name); + else if (!len name) + error(garbage found: %s%s, path, name); + else + error(garbage found: %s, path); + garbage++; +} + static void count_objects(DIR *d, char *path, int len, int verbose, unsigned long *loose, off_t *loose_size, @@ -76,7 +90,7 @@ int cmd_count_objects(int argc, const char **argv, const char *prefix) const char *objdir = get_object_directory(); int len = strlen(objdir); char *path = xmalloc(len + 50); - unsigned long loose = 0, packed = 0, packed_loose = 0, garbage = 0; + unsigned long loose = 0, packed = 0, packed_loose = 0; off_t loose_size = 0; struct option opts[] = { OPT__VERBOSE(verbose, N_(be verbose)), @@ -87,6 +101,8 @@ int cmd_count_objects(int argc, const char **argv, const char *prefix) /* we do not take arguments other than flags for now */ if (argc) usage_with_options(count_objects_usage, opts); + if (verbose) + report_pack_garbage = real_report_pack_garbage; memcpy(path, objdir, len); if (len objdir[len-1] != '/') path[len++] = '/'; diff --git a/sha1_file.c b/sha1_file.c index 40b2329..cc6ef03 100644 --- a/sha1_file.c +++ b/sha1_file.c @@ -21,6 +21,7 @@ #include sha1-lookup.h #include bulk-checkin.h #include streaming.h +#include dir.h #ifndef O_NOATIME #if defined(__linux__) (defined(__i386__) || defined(__PPC__)) @@ -1000,6 +1001,54 @@ void install_packed_git(struct packed_git *pack) packed_git = pack; } +/* A hook for count-objects to report invalid files in pack directory */ +void (*report_pack_garbage)(const char *path, int len, const char *name); + +static const char *known_pack_extensions[] = { .pack, .keep, NULL }; + +static void report_garbage(struct string_list *list) +{ + struct strbuf sb = STRBUF_INIT; + struct packed_git *p; + int i; + + if (!report_pack_garbage) + return; + + sort_string_list(list); + + for (p = packed_git; p; p = p-next) { + struct string_list_item *item; + if (!p-pack_local) + continue; + strbuf_reset(sb); + strbuf_add(sb, p-pack_name, + strlen(p-pack_name) - strlen(.pack)); + item = string_list_lookup(list, sb.buf); + if (!item) + continue; + /* +* string_list_lookup does not guarantee to return the +* first matched string if it's duplicated. +*/ + while (item - list-items + !strcmp(item[-1].string, item-string)) + item--; + while (item - list-items list-nr + !strcmp(item-string, sb.buf)) { + item-util = NULL; /* non-garbage mark */ + item++; + } + } + for (i = 0; i list-nr; i++) { + struct string_list_item *item = list-items + i; + if (!item-util) + continue; + report_pack_garbage(item-string, 0, item-util); + } + strbuf_release(sb); +} + static void