Re: [PATCH v3 4/4] read-cache: speed up index load through parallelization

2018-09-07 Thread Ben Peart




On 9/7/2018 12:16 AM, Torsten Bögershausen wrote:



> diff --git a/read-cache.c b/read-cache.c
> index fcc776aaf0..8537a55750 100644
> --- a/read-cache.c
> +++ b/read-cache.c
> @@ -1941,20 +1941,212 @@ static void *load_index_extensions(void *_data)
>  	return NULL;
>  }
>  
> +/*
> + * A helper function that will load the specified range of cache entries
> + * from the memory mapped file and add them to the given index.
> + */
> +static unsigned long load_cache_entry_block(struct index_state *istate,
> +	struct mem_pool *ce_mem_pool, int offset, int nr, void *mmap,
> +	unsigned long start_offset, struct strbuf *previous_name)
> +{
> +	int i;
> +	unsigned long src_offset = start_offset;

> I read an unsigned long here:
> should that be a size_t instead ?
> 
> (And probably even everywhere else in this patch)



It's a fair question.  The pre-patch code had a mix of unsigned long and 
size_t: both src_offset and consumed were unsigned long, but mmap_size 
was a size_t.  I stuck with that pattern for consistency.


While it would be possible to convert everything to size_t as a step 
toward enabling index files larger than 4 GB, I have a hard time 
believing that will be necessary for a very long time, and it would 
likely require more substantial changes than the type swap alone to 
make it work.
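
To make the width concern concrete, here is a minimal illustrative 
program (not part of the patch) showing why the distinction matters on 
LLP64 platforms such as 64-bit Windows, where unsigned long stays 32 
bits wide while size_t is 64 bits:

    #include <stdio.h>

    int main(void)
    {
        /* LP64 (Linux, macOS): both lines print 8.
         * LLP64 (64-bit Windows): unsigned long is 4 bytes, so an
         * unsigned long offset would cap the index file at 4 GB. */
        printf("sizeof(unsigned long) = %zu\n", sizeof(unsigned long));
        printf("sizeof(size_t)        = %zu\n", sizeof(size_t));
        return 0;
    }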



> +
> +	for (i = offset; i < offset + nr; i++) {
> +		struct ondisk_cache_entry *disk_ce;
> +		struct cache_entry *ce;
> +		unsigned long consumed;
> +
> +		disk_ce = (struct ondisk_cache_entry *)((char *)mmap + src_offset);
> +		ce = create_from_disk(ce_mem_pool, disk_ce, &consumed, previous_name);
> +		set_index_entry(istate, i, ce);
> +
> +		src_offset += consumed;
> +	}
> +	return src_offset - start_offset;
> +}
> +
> +static unsigned long load_all_cache_entries(struct index_state *istate,
> +	void *mmap, size_t mmap_size, unsigned long src_offset)
> +{
> +	struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
> +	unsigned long consumed;
> +
> +	if (istate->version == 4) {
> +		previous_name = &previous_name_buf;
> +		mem_pool_init(&istate->ce_mem_pool,
> +			estimate_cache_size_from_compressed(istate->cache_nr));
> +	} else {
> +		previous_name = NULL;
> +		mem_pool_init(&istate->ce_mem_pool,
> +			estimate_cache_size(mmap_size, istate->cache_nr));
> +	}
> +
> +	consumed = load_cache_entry_block(istate, istate->ce_mem_pool,
> +		0, istate->cache_nr, mmap, src_offset, previous_name);
> +	strbuf_release(&previous_name_buf);
> +	return consumed;
> +}
> +
> +#ifndef NO_PTHREADS
> +


Re: [PATCH v3 4/4] read-cache: speed up index load through parallelization

2018-09-06 Thread Torsten Bögershausen


> diff --git a/read-cache.c b/read-cache.c
> index fcc776aaf0..8537a55750 100644
> --- a/read-cache.c
> +++ b/read-cache.c
> @@ -1941,20 +1941,212 @@ static void *load_index_extensions(void *_data)
>   return NULL;
>  }
>  
> +/*
> + * A helper function that will load the specified range of cache entries
> + * from the memory mapped file and add them to the given index.
> + */
> +static unsigned long load_cache_entry_block(struct index_state *istate,
> + struct mem_pool *ce_mem_pool, int offset, int nr, void *mmap,
> + unsigned long start_offset, struct strbuf *previous_name)
> +{
> + int i;
> + unsigned long src_offset = start_offset;

I read an unsigned long here:
should that be a size_t instead ?

(And probably even everywhere else in this patch)

> +
> + for (i = offset; i < offset + nr; i++) {
> + struct ondisk_cache_entry *disk_ce;
> + struct cache_entry *ce;
> + unsigned long consumed;
> +
> + disk_ce = (struct ondisk_cache_entry *)((char *)mmap + src_offset);
> + ce = create_from_disk(ce_mem_pool, disk_ce, &consumed, previous_name);
> + set_index_entry(istate, i, ce);
> +
> + src_offset += consumed;
> + }
> + return src_offset - start_offset;
> +}
> +
> +static unsigned long load_all_cache_entries(struct index_state *istate,
> + void *mmap, size_t mmap_size, unsigned long src_offset)
> +{
> + struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
> + unsigned long consumed;
> +
> + if (istate->version == 4) {
> + previous_name = &previous_name_buf;
> + mem_pool_init(&istate->ce_mem_pool,
> + estimate_cache_size_from_compressed(istate->cache_nr));
> + } else {
> + previous_name = NULL;
> + mem_pool_init(&istate->ce_mem_pool,
> + estimate_cache_size(mmap_size, istate->cache_nr));
> + }
> +
> + consumed = load_cache_entry_block(istate, istate->ce_mem_pool,
> + 0, istate->cache_nr, mmap, src_offset, previous_name);
> + strbuf_release(&previous_name_buf);
> + return consumed;
> +}
> +
> +#ifndef NO_PTHREADS
> +


[PATCH v3 4/4] read-cache: speed up index load through parallelization

2018-09-06 Thread Ben Peart
This patch helps address the CPU cost of loading the index by creating
multiple threads to divide the work of loading and converting the cache
entries across all available CPU cores.

It accomplishes this by having the primary thread loop across the index
file, tracking the offset and (for V4 indexes) expanding the names. As
it reaches each block of entries, it spawns a thread to process that
block.
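
For readers skimming the patch below, here is a heavily simplified,
self-contained sketch of that scan-and-spawn pattern. It is an
illustration only, not the patch code; record_size() and parse_record()
are hypothetical stand-ins for the real on-disk cache-entry handling:

    #include <pthread.h>
    #include <stdlib.h>

    #define ENTRIES_PER_BLOCK 10000

    /* hypothetical: size in bytes of the on-disk entry at 'p' */
    extern size_t record_size(const char *p);
    /* hypothetical: parse the entry at 'p' into result slot 'i' */
    extern void parse_record(const char *p, int i);

    struct block_job {
        pthread_t tid;
        const char *base;   /* start of the mmap'ed file */
        size_t start;       /* byte offset of the block's first entry */
        int offset, nr;     /* index and count of entries in the block */
    };

    static void *worker(void *arg)
    {
        struct block_job *job = arg;
        size_t off = job->start;
        int i;

        for (i = job->offset; i < job->offset + job->nr; i++) {
            parse_record(job->base + off, i);
            off += record_size(job->base + off);
        }
        return NULL;
    }

    static void load_entries_threaded(const char *base, int nr_entries)
    {
        int nr_blocks = (nr_entries + ENTRIES_PER_BLOCK - 1) / ENTRIES_PER_BLOCK;
        struct block_job *jobs = calloc(nr_blocks, sizeof(*jobs));
        size_t off = 0;
        int i, b = 0, in_block = 0;

        /* The primary thread must walk every variable-length entry to
         * learn where each block starts; it fires off a worker as soon
         * as it has scanned past a full block. */
        for (i = 0; i < nr_entries; i++) {
            if (!in_block) {
                jobs[b].base = base;
                jobs[b].start = off;
                jobs[b].offset = i;
            }
            off += record_size(base + off);
            in_block++;
            if (in_block == ENTRIES_PER_BLOCK || i == nr_entries - 1) {
                jobs[b].nr = in_block;
                pthread_create(&jobs[b].tid, NULL, worker, &jobs[b]);
                b++;
                in_block = 0;
            }
        }
        for (i = 0; i < b; i++)
            pthread_join(jobs[i].tid, NULL);
        free(jobs);
    }

The patch proper layers Git's specifics on top of this shape: V4
prefix-compressed name expansion on the primary thread, a per-thread
previous_name strbuf, and mem_pool-based allocation of the cache
entries.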

I used p0002-read-cache.sh to generate some performance data:

p0002-read-cache.sh w/100,000 files
Baseline            Thread entries
------------------------------------------
20.71(0.03+0.03)    13.93(0.04+0.04) -32.7%

p0002-read-cache.sh w/1,000,000 files
Baseline            Thread entries
------------------------------------------
217.60(0.03+0.04)   199.00(0.00+0.10) -8.6%

Signed-off-by: Ben Peart 
---
 read-cache.c | 242 +--
 t/README |   6 ++
 2 files changed, 220 insertions(+), 28 deletions(-)

diff --git a/read-cache.c b/read-cache.c
index fcc776aaf0..8537a55750 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -1941,20 +1941,212 @@ static void *load_index_extensions(void *_data)
return NULL;
 }
 
+/*
+ * A helper function that will load the specified range of cache entries
+ * from the memory mapped file and add them to the given index.
+ */
+static unsigned long load_cache_entry_block(struct index_state *istate,
+   struct mem_pool *ce_mem_pool, int offset, int nr, void *mmap,
+   unsigned long start_offset, struct strbuf *previous_name)
+{
+   int i;
+   unsigned long src_offset = start_offset;
+
+   for (i = offset; i < offset + nr; i++) {
+   struct ondisk_cache_entry *disk_ce;
+   struct cache_entry *ce;
+   unsigned long consumed;
+
+   disk_ce = (struct ondisk_cache_entry *)((char *)mmap + src_offset);
+   ce = create_from_disk(ce_mem_pool, disk_ce, &consumed, previous_name);
+   set_index_entry(istate, i, ce);
+
+   src_offset += consumed;
+   }
+   return src_offset - start_offset;
+}
+
+static unsigned long load_all_cache_entries(struct index_state *istate,
+   void *mmap, size_t mmap_size, unsigned long src_offset)
+{
+   struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
+   unsigned long consumed;
+
+   if (istate->version == 4) {
+   previous_name = &previous_name_buf;
+   mem_pool_init(&istate->ce_mem_pool,
+       estimate_cache_size_from_compressed(istate->cache_nr));
+   } else {
+   previous_name = NULL;
+   mem_pool_init(&istate->ce_mem_pool,
+       estimate_cache_size(mmap_size, istate->cache_nr));
+   }
+
+   consumed = load_cache_entry_block(istate, istate->ce_mem_pool,
+       0, istate->cache_nr, mmap, src_offset, previous_name);
+   strbuf_release(&previous_name_buf);
+   return consumed;
+}
+
+#ifndef NO_PTHREADS
+
+/*
+ * Mostly randomly chosen maximum thread counts: we
+ * cap the parallelism to online_cpus() threads, and we want
+ * to have at least 10000 cache entries per thread for it to
+ * be worth starting a thread.
+ */
+#define THREAD_COST    (10000)
+
+struct load_cache_entries_thread_data
+{
+   pthread_t pthread;
+   struct index_state *istate;
+   struct mem_pool *ce_mem_pool;
+   int offset, nr;
+   void *mmap;
+   unsigned long start_offset;
+   struct strbuf previous_name_buf;
+   struct strbuf *previous_name;
+   unsigned long consumed; /* return # of bytes in index file processed */
+};
+
+/*
+ * A thread proc to run the load_cache_entries() computation
+ * across multiple background threads.
+ */
+static void *load_cache_entries_thread(void *_data)
+{
+   struct load_cache_entries_thread_data *p = _data;
+
+   p->consumed += load_cache_entry_block(p->istate, p->ce_mem_pool,
+   p->offset, p->nr, p->mmap, p->start_offset, p->previous_name);
+   return NULL;
+}
+
+static unsigned long load_cache_entries_threaded(int nr_threads, struct index_state *istate,
+   void *mmap, size_t mmap_size, unsigned long src_offset)
+{
+   struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
+   struct load_cache_entries_thread_data *data;
+   int ce_per_thread;
+   unsigned long consumed;
+   int i, thread;
+
+   /* a little sanity checking */
+   if (istate->name_hash_initialized)
+   BUG("the name hash isn't thread safe");
+
+   mem_pool_init(&istate->ce_mem_pool, 0);
+   if (istate->version == 4)
+   previous_name = &previous_name_buf;
+   else
+   previous_name = NULL;
+
+   ce_per_thread = DIV_ROUND_UP(istate->cache_nr, nr_threads);
+   data = xcalloc(nr_threads, sizeof(struct load_cache_entries_thread_data));
+
+   /*
+    * Loop through index