Am 02.02.2018 um 23:36 schrieb Jonathan Tan:
> Subsequent patches will introduce file formats that make use of a fanout
> array and a sorted table containing hashes, just like packfiles.
> Refactor the hash search in packfile.c into its own function, so that
> those patches can make use of it as well.
> 
> Signed-off-by: Jonathan Tan <jonathanta...@google.com>
> ---
>   packfile.c    | 19 +++++--------------
>   sha1-lookup.c | 24 ++++++++++++++++++++++++
>   sha1-lookup.h | 21 +++++++++++++++++++++
>   3 files changed, 50 insertions(+), 14 deletions(-)
> 
> diff --git a/packfile.c b/packfile.c
> index 58bdced3b..29f5dc239 100644
> --- a/packfile.c
> +++ b/packfile.c
> @@ -1712,7 +1712,8 @@ off_t find_pack_entry_one(const unsigned char *sha1,
>   {
>       const uint32_t *level1_ofs = p->index_data;
>       const unsigned char *index = p->index_data;
> -     unsigned hi, lo, stride;
> +     unsigned stride;
> +     int ret;
>   
>       if (!index) {
>               if (open_pack_index(p))
> @@ -1725,8 +1726,6 @@ off_t find_pack_entry_one(const unsigned char *sha1,
>               index += 8;
>       }
>       index += 4 * 256;
> -     hi = ntohl(level1_ofs[*sha1]);
> -     lo = ((*sha1 == 0x0) ? 0 : ntohl(level1_ofs[*sha1 - 1]));
>       if (p->index_version > 1) {
>               stride = 20;
>       } else {
> @@ -1734,17 +1733,9 @@ off_t find_pack_entry_one(const unsigned char *sha1,
>               index += 4;
>       }
>   
> -     while (lo < hi) {
> -             unsigned mi = lo + (hi - lo) / 2;
> -             int cmp = hashcmp(index + mi * stride, sha1);
> -
> -             if (!cmp)
> -                     return nth_packed_object_offset(p, mi);
> -             if (cmp > 0)
> -                     hi = mi;
> -             else
> -                     lo = mi+1;
> -     }
> +     ret = bsearch_hash(sha1, level1_ofs, index, stride);
> +     if (ret >= 0)
> +             return nth_packed_object_offset(p, ret);

Going from unsigned to signed int means the patch breaks support for
more than 2G pack entries, which was put with 326bf39677 (Use uint32_t
for all packed object counts.) in 2007.

>       return 0;
>   }
>   
> diff --git a/sha1-lookup.c b/sha1-lookup.c
> index 4cf3ebd92..d11c5e526 100644
> --- a/sha1-lookup.c
> +++ b/sha1-lookup.c
> @@ -99,3 +99,27 @@ int sha1_pos(const unsigned char *sha1, void *table, 
> size_t nr,
>       } while (lo < hi);
>       return -lo-1;
>   }
> +
> +int bsearch_hash(const unsigned char *sha1, const void *fanout_,
> +              const void *table_, size_t stride)
> +{
> +     const uint32_t *fanout = fanout_;

Why hide the type?  It doesn't make the function more generic.

> +     const unsigned char *table = table_;
> +     int hi, lo;
> +
> +     hi = ntohl(fanout[*sha1]);
> +     lo = ((*sha1 == 0x0) ? 0 : ntohl(fanout[*sha1 - 1]));
> +
> +     while (lo < hi) {
> +             unsigned mi = lo + (hi - lo) / 2;
> +             int cmp = hashcmp(table + mi * stride, sha1);
> +
> +             if (!cmp)
> +                     return mi;
> +             if (cmp > 0)
> +                     hi = mi;
> +             else
> +                     lo = mi + 1;
> +     }
> +     return -lo - 1;
> +}
> diff --git a/sha1-lookup.h b/sha1-lookup.h
> index cf5314f40..3c59e9cb1 100644
> --- a/sha1-lookup.h
> +++ b/sha1-lookup.h
> @@ -7,4 +7,25 @@ extern int sha1_pos(const unsigned char *sha1,
>                   void *table,
>                   size_t nr,
>                   sha1_access_fn fn);
> +
> +/*
> + * Searches for sha1 in table, using the given fanout table to determine the
> + * interval to search, then using binary search. Returns the element index of
> + * the position found if successful, -i-1 if not (where i is the index of the
> + * least element that is greater than sha1).
> + *
> + * Takes the following parameters:
> + *
> + *  - sha1: the hash to search for
> + *  - fanout: a 256-element array of NETWORK-order 32-bit integers; the 
> integer
> + *    at position i represents the number of elements in table whose first 
> byte
> + *    is less than or equal to i
> + *  - table: a sorted list of hashes with optional extra information in 
> between
> + *  - stride: distance between two consecutive elements in table (should be
> + *    GIT_MAX_RAWSZ or greater)
> + *
> + * This function does not verify the validity of the fanout table.
> + */
> +extern int bsearch_hash(const unsigned char *sha1, const void *fanout,
> +                     const void *table, size_t stride);
>   #endif
> 

Why not use sha1_pos()?  I guess because it avoids the overhead of the
accessor function, right?  And I wonder how much of difference it makes.

A binary search function for embedded hashes just needs the key, a
pointer to the first hash in the array, the stride and the number of
elements.  It can then be used with or without a fanout table, making it
more versatile.  Just a thought.

René

Reply via email to