Re: [PATCH v2 2/3] count-objects: report garbage files in pack directory too

2013-02-08 Thread Junio C Hamano
Nguyễn Thái Ngọc Duy  pclo...@gmail.com writes:

 prepare_packed_git_one() is modified to allow count-objects to hook a
 report function to so we don't need to duplicate the pack searching
 logic in count-objects.c. When report_pack_garbage is NULL, the
 overhead is insignificant.

 Signed-off-by: Nguyễn Thái Ngọc Duy pclo...@gmail.com
 ---
  Documentation/git-count-objects.txt |  4 +-
  builtin/count-objects.c | 18 -
  sha1_file.c | 81 
 +++--
  3 files changed, 97 insertions(+), 6 deletions(-)

 diff --git a/Documentation/git-count-objects.txt 
 b/Documentation/git-count-objects.txt
 index e816823..1611d7c 100644
 --- a/Documentation/git-count-objects.txt
 +++ b/Documentation/git-count-objects.txt
 @@ -33,8 +33,8 @@ size-pack: disk space consumed by the packs, in KiB
  prune-packable: the number of loose objects that are also present in
  the packs. These objects could be pruned using `git prune-packed`.
  +
 -garbage: the number of files in loose object database that are not
 -valid loose objects
 +garbage: the number of files in object database that are not valid
 +loose objects nor valid packs
  
  GIT
  ---
 diff --git a/builtin/count-objects.c b/builtin/count-objects.c
 index 9afaa88..118b2ae 100644
 --- a/builtin/count-objects.c
 +++ b/builtin/count-objects.c
 @@ -9,6 +9,20 @@
  #include builtin.h
  #include parse-options.h
  
 +static unsigned long garbage;
 +
 +extern void (*report_pack_garbage)(const char *path, int len, const char 
 *name);
 +static void real_report_pack_garbage(const char *path, int len, const char 
 *name)
 +{

Don't some callers call this on paths outside objects/pack/
directory?  Is it still report-pack-garbage?

 + if (len  name)
 + error(garbage found: %.*s/%s, len, path, name);
 + else if (!len  name)
 + error(garbage found: %s%s, path, name);
 + else
 + error(garbage found: %s, path);
 + garbage++;
 +}
 +
  static void count_objects(DIR *d, char *path, int len, int verbose,
 unsigned long *loose,
 off_t *loose_size,
 @@ -76,7 +90,7 @@ int cmd_count_objects(int argc, const char **argv, const 
 char *prefix)
   const char *objdir = get_object_directory();
   int len = strlen(objdir);
   char *path = xmalloc(len + 50);
 - unsigned long loose = 0, packed = 0, packed_loose = 0, garbage = 0;
 + unsigned long loose = 0, packed = 0, packed_loose = 0;
   off_t loose_size = 0;
   struct option opts[] = {
   OPT__VERBOSE(verbose, N_(be verbose)),
 @@ -87,6 +101,8 @@ int cmd_count_objects(int argc, const char **argv, const 
 char *prefix)
   /* we do not take arguments other than flags for now */
   if (argc)
   usage_with_options(count_objects_usage, opts);
 + if (verbose)
 + report_pack_garbage = real_report_pack_garbage;
   memcpy(path, objdir, len);
   if (len  objdir[len-1] != '/')
   path[len++] = '/';
 diff --git a/sha1_file.c b/sha1_file.c
 index 40b2329..cc6ef03 100644
 --- a/sha1_file.c
 +++ b/sha1_file.c
 @@ -21,6 +21,7 @@
  #include sha1-lookup.h
  #include bulk-checkin.h
  #include streaming.h
 +#include dir.h
  
  #ifndef O_NOATIME
  #if defined(__linux__)  (defined(__i386__) || defined(__PPC__))
 @@ -1000,6 +1001,54 @@ void install_packed_git(struct packed_git *pack)
   packed_git = pack;
  }
  
 +/* A hook for count-objects to report invalid files in pack directory */
 +void (*report_pack_garbage)(const char *path, int len, const char *name);
 +
 +static const char *known_pack_extensions[] = { .pack, .keep, NULL };

This sounds wrong.  Isn't .idx also known?

 +static void report_garbage(struct string_list *list)
 +{
 + struct strbuf sb = STRBUF_INIT;
 + struct packed_git *p;
 + int i;
 +
 + if (!report_pack_garbage)
 + return;
 +
 + sort_string_list(list);
 +
 + for (p = packed_git; p; p = p-next) {
 + struct string_list_item *item;
 + if (!p-pack_local)
 + continue;
 + strbuf_reset(sb);
 + strbuf_add(sb, p-pack_name,
 +strlen(p-pack_name) - strlen(.pack));
 + item = string_list_lookup(list, sb.buf);
 + if (!item)
 + continue;
 + /*
 +  * string_list_lookup does not guarantee to return the
 +  * first matched string if it's duplicated.
 +  */
 + while (item - list-items 
 +!strcmp(item[-1].string, item-string))
 + item--;
 + while (item - list-items  list-nr 
 +!strcmp(item-string, sb.buf)) {
 + item-util = NULL; /* non-garbage mark */
 + item++;
 + }
 + }
 + for (i = 0; i  list-nr; i++) {
 + struct string_list_item *item = 

Re: [PATCH v2 2/3] count-objects: report garbage files in pack directory too

2013-02-08 Thread Duy Nguyen
On Sat, Feb 9, 2013 at 1:44 AM, Junio C Hamano gits...@pobox.com wrote:
 +static void real_report_pack_garbage(const char *path, int len, const char 
 *name)
 +{

 Don't some callers call this on paths outside objects/pack/
 directory?  Is it still report-pack-garbage?

In fact 3/3 uses it to report loose garbage. Will rename.

 +static const char *known_pack_extensions[] = { .pack, .keep, NULL };

 This sounds wrong.  Isn't .idx also known?

I had a comment saying all known extensions related to a pack, except
.idx but I dropped it. .idx being used as the pointer file needs to
be handled separately.

 @@ -1024,14 +1074,37 @@ static void prepare_packed_git_one(char *objdir, int 
 local)
   int namelen = strlen(de-d_name);
   struct packed_git *p;

 - if (!has_extension(de-d_name, .idx))
 + if (len + namelen + 1  sizeof(path)) {
 + if (report_pack_garbage)
 + report_pack_garbage(path, len - 1, de-d_name);

 A pack/in/a/very/long/path/pack-.pack
 may pass when fed to git verify-pack, but this will report it as garbage,
 without reporting what is wrong with it.  Wouldn't that confuse users?

If all other git commands do not recognize the pack, then I think it's
still considered garbage. We could either

 - make prepare_packed_git_one accept pack/in/a/very/long/path-...
 - show the reason why we consider it garbage

Which option do we do? Option 1 may involve chdir in, stat stat stat
and chdir out to get around short PATH_MAX limit on some system.

 - if (len + namelen + 1  sizeof(path))
 + if (!has_extension(de-d_name, .idx)) {
 + struct string_list_item *item;
 + int i, n;
 + if (!report_pack_garbage)
 + continue;
 + if (is_dot_or_dotdot(de-d_name))
 + continue;
 + for (i = 0; known_pack_extensions[i]; i++)
 + if (has_extension(de-d_name,
 +   known_pack_extensions[i]))
 + break;
 + if (!known_pack_extensions[i]) {
 + report_pack_garbage(path, 0, NULL);
 + continue;
 + }
 + n = strlen(path) - strlen(known_pack_extensions[i]);
 + item = string_list_append_nodup(garbage,
 + xstrndup(path, n));
 + item-util = (void*)known_pack_extensions[i];
   continue;
 + }

 Why isn't this part more like this?

 if (dot-or-dotdot) {
 continue;
 } else if (has_extension(de-d_name, .idx)) {
 do things for the .idx file;
 } else if (has_extension(de-d_name, .pack) {
 do things for the .pack file, including
 queuing the name if we haven't seen
 corresponding .idx for later examination;
 } else if (has_extension(de-d_name, .keep) {
 nothing special for now but we may
 want to add some other checks later
 } else {
 everything else is a garbage
 report_pack_garbage();
 }

Originally I checked known_extensions again in report_pack_garbage()
but after restructuring, yeah we can drop known_extensions and do it
your way. Looks much clearer.
-- 
Duy
--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 2/3] count-objects: report garbage files in pack directory too

2013-02-07 Thread Nguyễn Thái Ngọc Duy
prepare_packed_git_one() is modified to allow count-objects to hook a
report function to so we don't need to duplicate the pack searching
logic in count-objects.c. When report_pack_garbage is NULL, the
overhead is insignificant.

Signed-off-by: Nguyễn Thái Ngọc Duy pclo...@gmail.com
---
 Documentation/git-count-objects.txt |  4 +-
 builtin/count-objects.c | 18 -
 sha1_file.c | 81 +++--
 3 files changed, 97 insertions(+), 6 deletions(-)

diff --git a/Documentation/git-count-objects.txt 
b/Documentation/git-count-objects.txt
index e816823..1611d7c 100644
--- a/Documentation/git-count-objects.txt
+++ b/Documentation/git-count-objects.txt
@@ -33,8 +33,8 @@ size-pack: disk space consumed by the packs, in KiB
 prune-packable: the number of loose objects that are also present in
 the packs. These objects could be pruned using `git prune-packed`.
 +
-garbage: the number of files in loose object database that are not
-valid loose objects
+garbage: the number of files in object database that are not valid
+loose objects nor valid packs
 
 GIT
 ---
diff --git a/builtin/count-objects.c b/builtin/count-objects.c
index 9afaa88..118b2ae 100644
--- a/builtin/count-objects.c
+++ b/builtin/count-objects.c
@@ -9,6 +9,20 @@
 #include builtin.h
 #include parse-options.h
 
+static unsigned long garbage;
+
+extern void (*report_pack_garbage)(const char *path, int len, const char 
*name);
+static void real_report_pack_garbage(const char *path, int len, const char 
*name)
+{
+   if (len  name)
+   error(garbage found: %.*s/%s, len, path, name);
+   else if (!len  name)
+   error(garbage found: %s%s, path, name);
+   else
+   error(garbage found: %s, path);
+   garbage++;
+}
+
 static void count_objects(DIR *d, char *path, int len, int verbose,
  unsigned long *loose,
  off_t *loose_size,
@@ -76,7 +90,7 @@ int cmd_count_objects(int argc, const char **argv, const char 
*prefix)
const char *objdir = get_object_directory();
int len = strlen(objdir);
char *path = xmalloc(len + 50);
-   unsigned long loose = 0, packed = 0, packed_loose = 0, garbage = 0;
+   unsigned long loose = 0, packed = 0, packed_loose = 0;
off_t loose_size = 0;
struct option opts[] = {
OPT__VERBOSE(verbose, N_(be verbose)),
@@ -87,6 +101,8 @@ int cmd_count_objects(int argc, const char **argv, const 
char *prefix)
/* we do not take arguments other than flags for now */
if (argc)
usage_with_options(count_objects_usage, opts);
+   if (verbose)
+   report_pack_garbage = real_report_pack_garbage;
memcpy(path, objdir, len);
if (len  objdir[len-1] != '/')
path[len++] = '/';
diff --git a/sha1_file.c b/sha1_file.c
index 40b2329..cc6ef03 100644
--- a/sha1_file.c
+++ b/sha1_file.c
@@ -21,6 +21,7 @@
 #include sha1-lookup.h
 #include bulk-checkin.h
 #include streaming.h
+#include dir.h
 
 #ifndef O_NOATIME
 #if defined(__linux__)  (defined(__i386__) || defined(__PPC__))
@@ -1000,6 +1001,54 @@ void install_packed_git(struct packed_git *pack)
packed_git = pack;
 }
 
+/* A hook for count-objects to report invalid files in pack directory */
+void (*report_pack_garbage)(const char *path, int len, const char *name);
+
+static const char *known_pack_extensions[] = { .pack, .keep, NULL };
+
+static void report_garbage(struct string_list *list)
+{
+   struct strbuf sb = STRBUF_INIT;
+   struct packed_git *p;
+   int i;
+
+   if (!report_pack_garbage)
+   return;
+
+   sort_string_list(list);
+
+   for (p = packed_git; p; p = p-next) {
+   struct string_list_item *item;
+   if (!p-pack_local)
+   continue;
+   strbuf_reset(sb);
+   strbuf_add(sb, p-pack_name,
+  strlen(p-pack_name) - strlen(.pack));
+   item = string_list_lookup(list, sb.buf);
+   if (!item)
+   continue;
+   /*
+* string_list_lookup does not guarantee to return the
+* first matched string if it's duplicated.
+*/
+   while (item - list-items 
+  !strcmp(item[-1].string, item-string))
+   item--;
+   while (item - list-items  list-nr 
+  !strcmp(item-string, sb.buf)) {
+   item-util = NULL; /* non-garbage mark */
+   item++;
+   }
+   }
+   for (i = 0; i  list-nr; i++) {
+   struct string_list_item *item = list-items + i;
+   if (!item-util)
+   continue;
+   report_pack_garbage(item-string, 0, item-util);
+   }
+   strbuf_release(sb);
+}
+
 static void