Re: [Toy PATCH] Avoid spilling collided entries in object hash table to the next slots

2013-03-29 Thread Duy Nguyen
On Sat, Mar 30, 2013 at 12:15 AM, Junio C Hamano  wrote:
> Nguyễn Thái Ngọc Duy   writes:
>
>> If a position in object hash table is taken, we currently check out
>> the next one. This could potentially create long object chains. We
>> could create linked lists instead and leave the next slot alone.
>
> In the current code, not just the logic in lookup_object(), but the
> logic to enforce load factor when create_object() decides to call
> grow_object_hash() and object enumeration implemented by
> get_max_object_index() and get_indexed_object() are closely tied to
> the open addressing scheme.  If you want to switch to any other
> method (e.g. separate chaining) these need to be updated quite a
> bit.
>
> I do not see get_max_object_index() and get_indexed_object() updated
> at all.  Do fsck, index-pack, name-rev and upload-pack still work?

No, apparently not. I should have been hit hard by not updating
grow_object_hash(). But I think it was ok because I was on top of my
preallocation patch and there probably were not many chains before
that patch kicked in.

> This particular implementation that uses a fake "union" is a bit
> ugly, but in general, it may be worth rethinking the object hash
> implementation.  I vaguely recall trying cuckoo in the vicinity of
> this code.

Yeah I saw that. Need to read and maybe try again some time. Still
playing with the linked list idea. If every time we find something in
a chain, we swap it to top (with hope it'll be accessed more often),
then the number of traversing in chains goes down from 33m times to
21.5m times. If we just push it up one node in the linked list, it's
21.3m times. Interesting.
--
Duy
--
To unsubscribe from this list: send the line "unsubscribe git" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Toy PATCH] Avoid spilling collided entries in object hash table to the next slots

2013-03-29 Thread Junio C Hamano
Junio C Hamano  writes:

> Nguyễn Thái Ngọc Duy   writes:
>
>> If a position in object hash table is taken, we currently check out
>> the next one. This could potentially create long object chains. We
>> could create linked lists instead and leave the next slot alone.
>
> In the current code, not just the logic in lookup_object(), but the
> logic to enforce load factor when create_object() decides to call
> grow_object_hash() and object enumeration implemented by
> get_max_object_index() and get_indexed_object() are closely tied to
> the open addressing scheme.  If you want to switch to any other
> method (e.g. separate chaining) these need to be updated quite a
> bit.
>
> I do not see get_max_object_index() and get_indexed_object() updated
> at all.  Do fsck, index-pack, name-rev and upload-pack still work?

You may want to start with a bit more abstraction around the
hashtable API.  Perhaps like this?

The idea is to let your object enumerator to be not just a simple
unsigned int into the flat hashtable, but be something like

typedef struct {
unsigned int slot;
struct obj_list *list;
} object_enumerator;

You store the current index in obj_hash[] to enu.slot, and if that
is IS_LST(), the linked-list element you are looking at in enu.list.
When you "increment" the iterator in object_enumerator_next(), you
increment enu.slot only after you reach the tail of the enu.list.



 builtin/fsck.c   | 17 ++---
 builtin/index-pack.c | 11 +++
 builtin/name-rev.c   | 20 +++-
 object.c | 22 +++---
 object.h |  8 ++--
 upload-pack.c| 23 ++-
 6 files changed, 67 insertions(+), 34 deletions(-)

diff --git a/builtin/fsck.c b/builtin/fsck.c
index bb9a2cd..5688cad 100644
--- a/builtin/fsck.c
+++ b/builtin/fsck.c
@@ -270,22 +270,25 @@ static void check_object(struct object *obj)
 
 static void check_connectivity(void)
 {
-   int i, max;
+   int max;
+   object_enumerator enu;
 
/* Traverse the pending reachable objects */
traverse_reachable();
 
/* Look up all the requirements, warn about missing objects.. */
-   max = get_max_object_index();
+   max = begin_object_enumeration(&enu);
if (verbose)
fprintf(stderr, "Checking connectivity (%d objects)\n", max);
 
-   for (i = 0; i < max; i++) {
-   struct object *obj = get_indexed_object(i);
-
-   if (obj)
-   check_object(obj);
+   if (max) {
+   do {
+   struct object *obj = get_enumerated_object(&enu);
+   if (obj)
+   check_object(obj);
+   } while (object_enumeration_next(&enu));
}
+   end_object_enumeration(&enu);
 }
 
 static int fsck_obj(struct object *obj)
diff --git a/builtin/index-pack.c b/builtin/index-pack.c
index ef62124..1d5b65c 100644
--- a/builtin/index-pack.c
+++ b/builtin/index-pack.c
@@ -195,11 +195,14 @@ static void check_object(struct object *obj)
 
 static void check_objects(void)
 {
-   unsigned i, max;
+   object_enumerator enu;
 
-   max = get_max_object_index();
-   for (i = 0; i < max; i++)
-   check_object(get_indexed_object(i));
+   if (begin_object_enumeration(&enu)) {
+   do {
+   check_object(get_enumerated_object(&enu));
+   } while (object_enumeration_next(&enu));
+   }
+   end_object_enumeration(&enu);
 }
 
 
diff --git a/builtin/name-rev.c b/builtin/name-rev.c
index 6238247..239c3ef 100644
--- a/builtin/name-rev.c
+++ b/builtin/name-rev.c
@@ -286,16 +286,18 @@ int cmd_name_rev(int argc, const char **argv, const char 
*prefix)
name_rev_line(p, &data);
}
} else if (all) {
-   int i, max;
-
-   max = get_max_object_index();
-   for (i = 0; i < max; i++) {
-   struct object *obj = get_indexed_object(i);
-   if (!obj || obj->type != OBJ_COMMIT)
-   continue;
-   show_name(obj, NULL,
- always, allow_undefined, data.name_only);
+   object_enumerator enu;
+
+   if (begin_object_enumeration(&enu)) {
+   do {
+   struct object *obj = 
get_enumerated_object(&enu);
+   if (!obj || obj->type != OBJ_COMMIT)
+   continue;
+   show_name(obj, NULL,
+ always, allow_undefined, 
data.name_only);
+   } while (object_enumeration_next(&enu));
}
+   end_object_enumeration(&enu);
} else {
int i;
for (i = 0; i < revs.nr; 

Re: [Toy PATCH] Avoid spilling collided entries in object hash table to the next slots

2013-03-29 Thread Junio C Hamano
Nguyễn Thái Ngọc Duy   writes:

> If a position in object hash table is taken, we currently check out
> the next one. This could potentially create long object chains. We
> could create linked lists instead and leave the next slot alone.

In the current code, not just the logic in lookup_object(), but the
logic to enforce load factor when create_object() decides to call
grow_object_hash() and object enumeration implemented by
get_max_object_index() and get_indexed_object() are closely tied to
the open addressing scheme.  If you want to switch to any other
method (e.g. separate chaining) these need to be updated quite a
bit.

I do not see get_max_object_index() and get_indexed_object() updated
at all.  Do fsck, index-pack, name-rev and upload-pack still work?

This particular implementation that uses a fake "union" is a bit
ugly, but in general, it may be worth rethinking the object hash
implementation.  I vaguely recall trying cuckoo in the vicinity of
this code.
--
To unsubscribe from this list: send the line "unsubscribe git" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[Toy PATCH] Avoid spilling collided entries in object hash table to the next slots

2013-03-29 Thread Nguyễn Thái Ngọc Duy
If a position in object hash table is taken, we currently check out
the next one. This could potentially create long object chains. We
could create linked lists instead and leave the next slot alone.

This patch relies on the fact that pointers are usually aligned more
than one byte and it uses the first bit as "object vs linked list"
indicator. Architectures that don't support this can fall back to
original implementation.

The saving is real, although not ground breaking. I'm just not sure it
is worth the ugliness that comes with this patch. Although we could
avoid the alignment problem by saving that bit in a separate bitmap.
"git rev-list --objects --all" on linux-2.6.git:

before   after
real0m33.407s0m31.732s
user0m32.926s0m31.460s
sys 0m0.407s 0m0.202s

lookup_object() goes down from 30% to 27% in perf reports.

Signed-off-by: Nguyễn Thái Ngọc Duy 
---
 object.c | 42 ++
 1 file changed, 42 insertions(+)

diff --git a/object.c b/object.c
index bcfd2c6..7b013a0 100644
--- a/object.c
+++ b/object.c
@@ -8,6 +8,17 @@
 static struct object **obj_hash;
 static int nr_objs, obj_hash_size;
 
+#ifdef USE_LINKED_LIST
+struct obj_list {
+   struct object *obj;
+   struct obj_list *next;
+};
+
+#define IS_LST(x) ((intptr_t)(x) & 1)
+#define MAKE_LST(x) (struct object *)((intptr_t)(x) | 1)
+#define GET_LST(x) (struct obj_list *)((intptr_t)(x) & ~1)
+#endif
+
 unsigned int get_max_object_index(void)
 {
return obj_hash_size;
@@ -53,13 +64,30 @@ static unsigned int hash_obj(struct object *obj, unsigned 
int n)
 static void insert_obj_hash(struct object *obj, struct object **hash, unsigned 
int size)
 {
unsigned int j = hash_obj(obj, size);
+#ifdef USE_LINKED_LIST
+   struct obj_list *lst, *o;
+
+   if (!hash[j]) {
+   hash[j] = obj;
+   return;
+   }
 
+   o = xmalloc(sizeof(*o));
+   o->obj = obj;
+
+   if (IS_LST(hash[j]))
+   o->next = GET_LST(hash[j]);
+   else
+   o->next = NULL;
+   hash[j] = MAKE_LST(o);
+#else
while (hash[j]) {
j++;
if (j >= size)
j = 0;
}
hash[j] = obj;
+#endif
 }
 
 static unsigned int hashtable_index(const unsigned char *sha1)
@@ -78,6 +106,19 @@ struct object *lookup_object(const unsigned char *sha1)
return NULL;
 
i = hashtable_index(sha1);
+#ifdef USE_LINKED_LIST
+   if (IS_LST(obj_hash[i])) {
+   struct obj_list *lst;
+   for (lst = GET_LST(obj_hash[i]); lst; lst = lst->next) {
+   if (!hashcmp(lst->obj->sha1, sha1))
+   return lst->obj;
+   }
+   return NULL;
+   } else {
+   struct object *obj = obj_hash[i];
+   return obj && !hashcmp(sha1, obj->sha1) ? obj : NULL;
+   }
+#else
while ((obj = obj_hash[i]) != NULL) {
if (!hashcmp(sha1, obj->sha1))
break;
@@ -86,6 +127,7 @@ struct object *lookup_object(const unsigned char *sha1)
i = 0;
}
return obj;
+#endif
 }
 
 void grow_object_hash_to(unsigned long nr)
-- 
1.8.2.83.gc99314b

--
To unsubscribe from this list: send the line "unsubscribe git" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html