Re: [PATCH 01/23] pack v4: initial pack dictionary structure and code

2013-08-27 Thread Nicolas Pitre
On Tue, 27 Aug 2013, Junio C Hamano wrote:

> Nicolas Pitre  writes:
> 
> > Signed-off-by: Nicolas Pitre 
> > ---
> 
> Was there a reason not to reuse the hash-table Linus did in
> hash.[ch]?

Well... Most likely because when I started that code (which used to be 
quite different initially) it might not have been served correctly by 
hash.c, or any other reasons I long have forgotten by now which might or 
might not still be valid.

> It may not make much of a difference for something so small and
> isolated from the rest of the system, but if hash.[ch] can be easily
> fixed with a small tweak to suit the use by this subsystem better,
> it might be worth reusing the existing code with improvement, which
> may help other potential users.

Absolutely.  If someone wants to give a hand in that direction I'll 
happily integrate patches into my series.  I cannot promise I'll do the 
work myself as I prefer spending the time I have available on actually 
making pack v4 usable.


Nicolas
--
To unsubscribe from this list: send the line "unsubscribe git" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 01/23] pack v4: initial pack dictionary structure and code

2013-08-27 Thread Junio C Hamano
Nicolas Pitre  writes:

> Signed-off-by: Nicolas Pitre 
> ---

Was there a reason not to reuse the hash-table Linus did in
hash.[ch]?

It may not make much of a difference for something so small and
isolated from the rest of the system, but if hash.[ch] can be easily
fixed with a small tweak to suit the use by this subsystem better,
it might be worth reusing the existing code with improvement, which
may help other potential users.

>  packv4-create.c | 137 
> 
>  1 file changed, 137 insertions(+)
>  create mode 100644 packv4-create.c
>
> diff --git a/packv4-create.c b/packv4-create.c
> new file mode 100644
> index 000..2de6d41
> --- /dev/null
> +++ b/packv4-create.c
> @@ -0,0 +1,137 @@
> +/*
> + * packv4-create.c: management of dictionary tables used in pack v4
> + *
> + * (C) Nicolas Pitre 
> + *
> + * This code is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + */
> +
> +#include "cache.h"
> +
> +struct data_entry {
> + unsigned offset;
> + unsigned hits;
> +};
> +
> +struct dict_table {
> + char *data;
> + unsigned ptr;
> + unsigned size;
> + struct data_entry *entry;
> + unsigned nb_entries;
> + unsigned max_entries;
> + unsigned *hash;
> + unsigned hash_size;
> +};
> +
> +struct dict_table *create_dict_table(void)
> +{
> + return xcalloc(sizeof(struct dict_table), 1);
> +}
> +
> +void destroy_dict_table(struct dict_table *t)
> +{
> + free(t->data);
> + free(t->entry);
> + free(t->hash);
> + free(t);
> +}
> +
> +static int locate_entry(struct dict_table *t, const char *str)
> +{
> + int i = 0;
> + const unsigned char *s = (const unsigned char *) str;
> +
> + while (*s)
> + i = i * 111 + *s++;
> + i = (unsigned)i % t->hash_size;
> +
> + while (t->hash[i]) {
> + unsigned n = t->hash[i] - 1;
> + if (!strcmp(str, t->data + t->entry[n].offset))
> + return n;
> + if (++i >= t->hash_size)
> + i = 0;
> + }
> + return -1 - i;
> +}
> +
> +static void rehash_entries(struct dict_table *t)
> +{
> + unsigned n;
> +
> + t->hash_size *= 2;
> + if (t->hash_size < 1024)
> + t->hash_size = 1024;
> + t->hash = xrealloc(t->hash, t->hash_size * sizeof(*t->hash));
> + memset(t->hash, 0, t->hash_size * sizeof(*t->hash));
> +
> + for (n = 0; n < t->nb_entries; n++) {
> + int i = locate_entry(t, t->data + t->entry[n].offset);
> + if (i < 0)
> + t->hash[-1 - i] = n + 1;
> + }
> +}
> +
> +int dict_add_entry(struct dict_table *t, const char *str)
> +{
> + int i, len = strlen(str) + 1;
> +
> + if (t->ptr + len >= t->size) {
> + t->size = (t->size + len + 1024) * 3 / 2;
> + t->data = xrealloc(t->data, t->size);
> + }
> + memcpy(t->data + t->ptr, str, len);
> +
> + i = (t->nb_entries) ? locate_entry(t, t->data + t->ptr) : -1;
> + if (i >= 0) {
> + t->entry[i].hits++;
> + return i;
> + }
> +
> + if (t->nb_entries >= t->max_entries) {
> + t->max_entries = (t->max_entries + 1024) * 3 / 2;
> + t->entry = xrealloc(t->entry, t->max_entries * 
> sizeof(*t->entry));
> + }
> + t->entry[t->nb_entries].offset = t->ptr;
> + t->entry[t->nb_entries].hits = 1;
> + t->ptr += len + 1;
> + t->nb_entries++;
> +
> + if (t->hash_size * 3 <= t->nb_entries * 4)
> + rehash_entries(t);
> + else
> + t->hash[-1 - i] = t->nb_entries;
> +
> + return t->nb_entries - 1;
> +}
> +
> +static int cmp_dict_entries(const void *a_, const void *b_)
> +{
> + const struct data_entry *a = a_;
> + const struct data_entry *b = b_;
> + int diff = b->hits - a->hits;
> + if (!diff)
> + diff = a->offset - b->offset;
> + return diff;
> +}
> +
> +static void sort_dict_entries_by_hits(struct dict_table *t)
> +{
> + qsort(t->entry, t->nb_entries, sizeof(*t->entry), cmp_dict_entries);
> + t->hash_size = (t->nb_entries * 4 / 3) / 2;
> + rehash_entries(t);
> +}
> +
> +void dict_dump(struct dict_table *t)
> +{
> + int i;
> +
> + sort_dict_entries_by_hits(t);
> + for (i = 0; i < t->nb_entries; i++)
> + printf("%d\t%s\n",
> + t->entry[i].hits,
> + t->data + t->entry[i].offset);
> +}
--
To unsubscribe from this list: send the line "unsubscribe git" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 01/23] pack v4: initial pack dictionary structure and code

2013-08-26 Thread Nicolas Pitre
Signed-off-by: Nicolas Pitre 
---
 packv4-create.c | 137 
 1 file changed, 137 insertions(+)
 create mode 100644 packv4-create.c

diff --git a/packv4-create.c b/packv4-create.c
new file mode 100644
index 000..2de6d41
--- /dev/null
+++ b/packv4-create.c
@@ -0,0 +1,137 @@
+/*
+ * packv4-create.c: management of dictionary tables used in pack v4
+ *
+ * (C) Nicolas Pitre 
+ *
+ * This code is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include "cache.h"
+
+struct data_entry {
+   unsigned offset;
+   unsigned hits;
+};
+
+struct dict_table {
+   char *data;
+   unsigned ptr;
+   unsigned size;
+   struct data_entry *entry;
+   unsigned nb_entries;
+   unsigned max_entries;
+   unsigned *hash;
+   unsigned hash_size;
+};
+
+struct dict_table *create_dict_table(void)
+{
+   return xcalloc(sizeof(struct dict_table), 1);
+}
+
+void destroy_dict_table(struct dict_table *t)
+{
+   free(t->data);
+   free(t->entry);
+   free(t->hash);
+   free(t);
+}
+
+static int locate_entry(struct dict_table *t, const char *str)
+{
+   int i = 0;
+   const unsigned char *s = (const unsigned char *) str;
+
+   while (*s)
+   i = i * 111 + *s++;
+   i = (unsigned)i % t->hash_size;
+
+   while (t->hash[i]) {
+   unsigned n = t->hash[i] - 1;
+   if (!strcmp(str, t->data + t->entry[n].offset))
+   return n;
+   if (++i >= t->hash_size)
+   i = 0;
+   }
+   return -1 - i;
+}
+
+static void rehash_entries(struct dict_table *t)
+{
+   unsigned n;
+
+   t->hash_size *= 2;
+   if (t->hash_size < 1024)
+   t->hash_size = 1024;
+   t->hash = xrealloc(t->hash, t->hash_size * sizeof(*t->hash));
+   memset(t->hash, 0, t->hash_size * sizeof(*t->hash));
+
+   for (n = 0; n < t->nb_entries; n++) {
+   int i = locate_entry(t, t->data + t->entry[n].offset);
+   if (i < 0)
+   t->hash[-1 - i] = n + 1;
+   }
+}
+
+int dict_add_entry(struct dict_table *t, const char *str)
+{
+   int i, len = strlen(str) + 1;
+
+   if (t->ptr + len >= t->size) {
+   t->size = (t->size + len + 1024) * 3 / 2;
+   t->data = xrealloc(t->data, t->size);
+   }
+   memcpy(t->data + t->ptr, str, len);
+
+   i = (t->nb_entries) ? locate_entry(t, t->data + t->ptr) : -1;
+   if (i >= 0) {
+   t->entry[i].hits++;
+   return i;
+   }
+
+   if (t->nb_entries >= t->max_entries) {
+   t->max_entries = (t->max_entries + 1024) * 3 / 2;
+   t->entry = xrealloc(t->entry, t->max_entries * 
sizeof(*t->entry));
+   }
+   t->entry[t->nb_entries].offset = t->ptr;
+   t->entry[t->nb_entries].hits = 1;
+   t->ptr += len + 1;
+   t->nb_entries++;
+
+   if (t->hash_size * 3 <= t->nb_entries * 4)
+   rehash_entries(t);
+   else
+   t->hash[-1 - i] = t->nb_entries;
+
+   return t->nb_entries - 1;
+}
+
+static int cmp_dict_entries(const void *a_, const void *b_)
+{
+   const struct data_entry *a = a_;
+   const struct data_entry *b = b_;
+   int diff = b->hits - a->hits;
+   if (!diff)
+   diff = a->offset - b->offset;
+   return diff;
+}
+
+static void sort_dict_entries_by_hits(struct dict_table *t)
+{
+   qsort(t->entry, t->nb_entries, sizeof(*t->entry), cmp_dict_entries);
+   t->hash_size = (t->nb_entries * 4 / 3) / 2;
+   rehash_entries(t);
+}
+
+void dict_dump(struct dict_table *t)
+{
+   int i;
+
+   sort_dict_entries_by_hits(t);
+   for (i = 0; i < t->nb_entries; i++)
+   printf("%d\t%s\n",
+   t->entry[i].hits,
+   t->data + t->entry[i].offset);
+}
-- 
1.8.4.22.g54757b7

--
To unsubscribe from this list: send the line "unsubscribe git" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html