Hi,
On Thu, Sep 05, 2013 at 02:19:28AM -0400, Nicolas Pitre wrote:
> Let's create another dictionary table to hold the author and committer
> entries. We use the same table format used for tree entries where the
> 16 bit data prefix is conveniently used to store the timezone value.
>
> In order to copy straight from a commit object buffer, dict_add_entry()
> is modified to get the string length as the provided string pointer is
> not always be null terminated.
>
> Signed-off-by: Nicolas Pitre
> ---
> packv4-create.c | 98
> +++--
> 1 file changed, 89 insertions(+), 9 deletions(-)
>
> diff --git a/packv4-create.c b/packv4-create.c
> index eccd9fc..5c08871 100644
> --- a/packv4-create.c
> +++ b/packv4-create.c
> @@ -1,5 +1,5 @@
> /*
> - * packv4-create.c: management of dictionary tables used in pack v4
> + * packv4-create.c: creation of dictionary tables and objects used in pack v4
> *
> * (C) Nicolas Pitre
> *
> @@ -80,9 +80,9 @@ static void rehash_entries(struct dict_table *t)
> }
> }
>
> -int dict_add_entry(struct dict_table *t, int val, const char *str)
> +int dict_add_entry(struct dict_table *t, int val, const char *str, int
> str_len)
> {
> - int i, val_len = 2, str_len = strlen(str) + 1;
> + int i, val_len = 2;
>
> if (t->ptr + val_len + str_len > t->size) {
We need a +1 here on the left side, i.e.
if (t->ptr + val_len + str_len + 1 > t->size) {
The str_len variable accounted for the terminating null character
before, but this patch removes str_len = strlen(str) + 1; above, and
callers specify the length of str without the terminating null in
str_len. Thus it can lead to memory corruption, when the new entry
happens to end at 't->ptr + val_len + str_len' and the line added in
the next hunk writes the terminating null beyond the end of the
buffer. I couldn't create a v4 pack from a current linux repo because
of this; either glibc detected something or 'git packv4-create'
crashed.
Sidenote: couldn't we call the 'ptr' field something else, like
end_offset or end_idx? It took me some headscratching to figure out
why is it OK to compare a pointer to an integer above, or use a
pointer without dereferencing as an index into an array below (because
ptr is, well, not a pointer after all).
> t->size = (t->size + val_len + str_len + 1024) * 3 / 2;
> @@ -92,6 +92,7 @@ int dict_add_entry(struct dict_table *t, int val, const
> char *str)
> t->data[t->ptr] = val >> 8;
> t->data[t->ptr + 1] = val;
> memcpy(t->data + t->ptr + val_len, str, str_len);
> + t->data[t->ptr + val_len + str_len] = 0;
>
> i = (t->nb_entries) ?
> locate_entry(t, t->data + t->ptr, val_len + str_len) : -1;
> @@ -107,7 +108,7 @@ int dict_add_entry(struct dict_table *t, int val, const
> char *str)
> t->entry[t->nb_entries].offset = t->ptr;
> t->entry[t->nb_entries].size = val_len + str_len;
> t->entry[t->nb_entries].hits = 1;
> - t->ptr += val_len + str_len;
> + t->ptr += val_len + str_len + 1;
Good.
Best,
Gábor
> t->nb_entries++;
>
> if (t->hash_size * 3 <= t->nb_entries * 4)
> @@ -135,8 +136,73 @@ static void sort_dict_entries_by_hits(struct dict_table
> *t)
> rehash_entries(t);
> }
>
> +static struct dict_table *commit_name_table;
> static struct dict_table *tree_path_table;
>
> +/*
> + * Parse the author/committer line from a canonical commit object.
> + * The 'from' argument points right after the "author " or "committer "
> + * string. The time zone is parsed and stored in *tz_val. The returned
> + * pointer is right after the end of the email address which is also just
> + * before the time value, or NULL if a parsing error is encountered.
> + */
> +static char *get_nameend_and_tz(char *from, int *tz_val)
> +{
> + char *end, *tz;
> +
> + tz = strchr(from, '\n');
> + /* let's assume the smallest possible string to be "x 0 +\n" */
> + if (!tz || tz - from < 13)
> + return NULL;
> + tz -= 4;
> + end = tz - 4;
> + while (end - from > 5 && *end != ' ')
> + end--;
> + if (end[-1] != '>' || end[0] != ' ' || tz[-2] != ' ')
> + return NULL;
> + *tz_val = (tz[0] - '0') * 1000 +
> + (tz[1] - '0') * 100 +
> + (tz[2] - '0') * 10 +
> + (tz[3] - '0');
> + switch (tz[-1]) {
> + default:return NULL;
> + case '+': break;
> + case '-': *tz_val = -*tz_val;
> + }
> + return end;
> +}
> +
> +static int add_commit_dict_entries(void *buf, unsigned long size)
> +{
> + char *name, *end = NULL;
> + int tz_val;
> +
> + if (!commit_name_table)
> + commit_name_table = create_dict_table();
> +
> + /* parse and add author info */
> + name = strstr(buf, "\nauthor ");
> + if (name) {
> + name += 8;
> + end = get_nameend_and_tz(