Re: [PATCH 01/23] pack v4: initial pack dictionary structure and code

2013-08-27 Thread Junio C Hamano
Nicolas Pitre n...@fluxnic.net writes:

 Signed-off-by: Nicolas Pitre n...@fluxnic.net
 ---

Was there a reason not to reuse the hash-table Linus did in
hash.[ch]?

It may not make much of a difference for something so small and
isolated from the rest of the system, but if hash.[ch] can be easily
fixed with a small tweak to suit the use by this subsystem better,
it might be worth reusing the existing code with improvement, which
may help other potential users.

  packv4-create.c | 137 
 
  1 file changed, 137 insertions(+)
  create mode 100644 packv4-create.c

 diff --git a/packv4-create.c b/packv4-create.c
 new file mode 100644
 index 000..2de6d41
 --- /dev/null
 +++ b/packv4-create.c
 @@ -0,0 +1,137 @@
 +/*
 + * packv4-create.c: management of dictionary tables used in pack v4
 + *
 + * (C) Nicolas Pitre n...@fluxnic.net
 + *
 + * This code is free software; you can redistribute it and/or modify
 + * it under the terms of the GNU General Public License version 2 as
 + * published by the Free Software Foundation.
 + */
 +
 +#include cache.h
 +
 +struct data_entry {
 + unsigned offset;
 + unsigned hits;
 +};
 +
 +struct dict_table {
 + char *data;
 + unsigned ptr;
 + unsigned size;
 + struct data_entry *entry;
 + unsigned nb_entries;
 + unsigned max_entries;
 + unsigned *hash;
 + unsigned hash_size;
 +};
 +
 +struct dict_table *create_dict_table(void)
 +{
 + return xcalloc(sizeof(struct dict_table), 1);
 +}
 +
 +void destroy_dict_table(struct dict_table *t)
 +{
 + free(t-data);
 + free(t-entry);
 + free(t-hash);
 + free(t);
 +}
 +
 +static int locate_entry(struct dict_table *t, const char *str)
 +{
 + int i = 0;
 + const unsigned char *s = (const unsigned char *) str;
 +
 + while (*s)
 + i = i * 111 + *s++;
 + i = (unsigned)i % t-hash_size;
 +
 + while (t-hash[i]) {
 + unsigned n = t-hash[i] - 1;
 + if (!strcmp(str, t-data + t-entry[n].offset))
 + return n;
 + if (++i = t-hash_size)
 + i = 0;
 + }
 + return -1 - i;
 +}
 +
 +static void rehash_entries(struct dict_table *t)
 +{
 + unsigned n;
 +
 + t-hash_size *= 2;
 + if (t-hash_size  1024)
 + t-hash_size = 1024;
 + t-hash = xrealloc(t-hash, t-hash_size * sizeof(*t-hash));
 + memset(t-hash, 0, t-hash_size * sizeof(*t-hash));
 +
 + for (n = 0; n  t-nb_entries; n++) {
 + int i = locate_entry(t, t-data + t-entry[n].offset);
 + if (i  0)
 + t-hash[-1 - i] = n + 1;
 + }
 +}
 +
 +int dict_add_entry(struct dict_table *t, const char *str)
 +{
 + int i, len = strlen(str) + 1;
 +
 + if (t-ptr + len = t-size) {
 + t-size = (t-size + len + 1024) * 3 / 2;
 + t-data = xrealloc(t-data, t-size);
 + }
 + memcpy(t-data + t-ptr, str, len);
 +
 + i = (t-nb_entries) ? locate_entry(t, t-data + t-ptr) : -1;
 + if (i = 0) {
 + t-entry[i].hits++;
 + return i;
 + }
 +
 + if (t-nb_entries = t-max_entries) {
 + t-max_entries = (t-max_entries + 1024) * 3 / 2;
 + t-entry = xrealloc(t-entry, t-max_entries * 
 sizeof(*t-entry));
 + }
 + t-entry[t-nb_entries].offset = t-ptr;
 + t-entry[t-nb_entries].hits = 1;
 + t-ptr += len + 1;
 + t-nb_entries++;
 +
 + if (t-hash_size * 3 = t-nb_entries * 4)
 + rehash_entries(t);
 + else
 + t-hash[-1 - i] = t-nb_entries;
 +
 + return t-nb_entries - 1;
 +}
 +
 +static int cmp_dict_entries(const void *a_, const void *b_)
 +{
 + const struct data_entry *a = a_;
 + const struct data_entry *b = b_;
 + int diff = b-hits - a-hits;
 + if (!diff)
 + diff = a-offset - b-offset;
 + return diff;
 +}
 +
 +static void sort_dict_entries_by_hits(struct dict_table *t)
 +{
 + qsort(t-entry, t-nb_entries, sizeof(*t-entry), cmp_dict_entries);
 + t-hash_size = (t-nb_entries * 4 / 3) / 2;
 + rehash_entries(t);
 +}
 +
 +void dict_dump(struct dict_table *t)
 +{
 + int i;
 +
 + sort_dict_entries_by_hits(t);
 + for (i = 0; i  t-nb_entries; i++)
 + printf(%d\t%s\n,
 + t-entry[i].hits,
 + t-data + t-entry[i].offset);
 +}
--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 01/23] pack v4: initial pack dictionary structure and code

2013-08-27 Thread Nicolas Pitre
On Tue, 27 Aug 2013, Junio C Hamano wrote:

 Nicolas Pitre n...@fluxnic.net writes:
 
  Signed-off-by: Nicolas Pitre n...@fluxnic.net
  ---
 
 Was there a reason not to reuse the hash-table Linus did in
 hash.[ch]?

Well... Most likely because when I started that code (which used to be 
quite different initially) it might not have been served correctly by 
hash.c, or any other reasons I long have forgotten by now which might or 
might not still be valid.

 It may not make much of a difference for something so small and
 isolated from the rest of the system, but if hash.[ch] can be easily
 fixed with a small tweak to suit the use by this subsystem better,
 it might be worth reusing the existing code with improvement, which
 may help other potential users.

Absolutely.  If someone wants to give a hand in that direction I'll 
happily integrate patches into my series.  I cannot promise I'll do the 
work myself as I prefer spending the time I have available on actually 
making pack v4 usable.


Nicolas
--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 01/23] pack v4: initial pack dictionary structure and code

2013-08-26 Thread Nicolas Pitre
Signed-off-by: Nicolas Pitre n...@fluxnic.net
---
 packv4-create.c | 137 
 1 file changed, 137 insertions(+)
 create mode 100644 packv4-create.c

diff --git a/packv4-create.c b/packv4-create.c
new file mode 100644
index 000..2de6d41
--- /dev/null
+++ b/packv4-create.c
@@ -0,0 +1,137 @@
+/*
+ * packv4-create.c: management of dictionary tables used in pack v4
+ *
+ * (C) Nicolas Pitre n...@fluxnic.net
+ *
+ * This code is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include cache.h
+
+struct data_entry {
+   unsigned offset;
+   unsigned hits;
+};
+
+struct dict_table {
+   char *data;
+   unsigned ptr;
+   unsigned size;
+   struct data_entry *entry;
+   unsigned nb_entries;
+   unsigned max_entries;
+   unsigned *hash;
+   unsigned hash_size;
+};
+
+struct dict_table *create_dict_table(void)
+{
+   return xcalloc(sizeof(struct dict_table), 1);
+}
+
+void destroy_dict_table(struct dict_table *t)
+{
+   free(t-data);
+   free(t-entry);
+   free(t-hash);
+   free(t);
+}
+
+static int locate_entry(struct dict_table *t, const char *str)
+{
+   int i = 0;
+   const unsigned char *s = (const unsigned char *) str;
+
+   while (*s)
+   i = i * 111 + *s++;
+   i = (unsigned)i % t-hash_size;
+
+   while (t-hash[i]) {
+   unsigned n = t-hash[i] - 1;
+   if (!strcmp(str, t-data + t-entry[n].offset))
+   return n;
+   if (++i = t-hash_size)
+   i = 0;
+   }
+   return -1 - i;
+}
+
+static void rehash_entries(struct dict_table *t)
+{
+   unsigned n;
+
+   t-hash_size *= 2;
+   if (t-hash_size  1024)
+   t-hash_size = 1024;
+   t-hash = xrealloc(t-hash, t-hash_size * sizeof(*t-hash));
+   memset(t-hash, 0, t-hash_size * sizeof(*t-hash));
+
+   for (n = 0; n  t-nb_entries; n++) {
+   int i = locate_entry(t, t-data + t-entry[n].offset);
+   if (i  0)
+   t-hash[-1 - i] = n + 1;
+   }
+}
+
+int dict_add_entry(struct dict_table *t, const char *str)
+{
+   int i, len = strlen(str) + 1;
+
+   if (t-ptr + len = t-size) {
+   t-size = (t-size + len + 1024) * 3 / 2;
+   t-data = xrealloc(t-data, t-size);
+   }
+   memcpy(t-data + t-ptr, str, len);
+
+   i = (t-nb_entries) ? locate_entry(t, t-data + t-ptr) : -1;
+   if (i = 0) {
+   t-entry[i].hits++;
+   return i;
+   }
+
+   if (t-nb_entries = t-max_entries) {
+   t-max_entries = (t-max_entries + 1024) * 3 / 2;
+   t-entry = xrealloc(t-entry, t-max_entries * 
sizeof(*t-entry));
+   }
+   t-entry[t-nb_entries].offset = t-ptr;
+   t-entry[t-nb_entries].hits = 1;
+   t-ptr += len + 1;
+   t-nb_entries++;
+
+   if (t-hash_size * 3 = t-nb_entries * 4)
+   rehash_entries(t);
+   else
+   t-hash[-1 - i] = t-nb_entries;
+
+   return t-nb_entries - 1;
+}
+
+static int cmp_dict_entries(const void *a_, const void *b_)
+{
+   const struct data_entry *a = a_;
+   const struct data_entry *b = b_;
+   int diff = b-hits - a-hits;
+   if (!diff)
+   diff = a-offset - b-offset;
+   return diff;
+}
+
+static void sort_dict_entries_by_hits(struct dict_table *t)
+{
+   qsort(t-entry, t-nb_entries, sizeof(*t-entry), cmp_dict_entries);
+   t-hash_size = (t-nb_entries * 4 / 3) / 2;
+   rehash_entries(t);
+}
+
+void dict_dump(struct dict_table *t)
+{
+   int i;
+
+   sort_dict_entries_by_hits(t);
+   for (i = 0; i  t-nb_entries; i++)
+   printf(%d\t%s\n,
+   t-entry[i].hits,
+   t-data + t-entry[i].offset);
+}
-- 
1.8.4.22.g54757b7

--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html