Hello Kumagai,

I tested it, and it works well. The results are as follows.

In a virtual machine (2 GB memory):
with empty memory:
          version      |  num-threads  |  time(sec)
       ----------------+---------------+-------------
            devel      |       0       |    12.76
            devel      |       1       |    19.29
            devel      |       2       |    11.56
        + this patch   |       0       |    12.85
        + this patch   |       1       |     5.61
        + this patch   |       2       |     2.68

with full memory:
          version      |  num-threads  |  time(sec)
       ----------------+---------------+-------------
            devel      |       0       |    51.18
            devel      |       1       |    57.82
            devel      |       2       |    41.54
        + this patch   |       0       |    49.25
        + this patch   |       1       |    44.80
        + this patch   |       2       |    33.87


On a real machine (16 GB memory):
with empty memory:
          version      |  num-threads  |  time(sec)
       ----------------+---------------+-------------
            devel      |       0       |    86.12
            devel      |       1       |   222.37
            devel      |       8       |    81.50
            devel      |       16      |    98.44
        + this patch   |       0       |    86.07
        + this patch   |       1       |    84.33
        + this patch   |       8       |    14.95
        + this patch   |       16      |    13.96

with full memory:
          version      |  num-threads  |  time(sec)
       ----------------+---------------+-------------
            devel      |       0       |   540.89
            devel      |       1       |   715.25
            devel      |       8       |   132.54
            devel      |       16      |   112.89
        + this patch   |       0       |   542.79
        + this patch   |       1       |   538.22
        + this patch   |       8       |   108.28
        + this patch   |       16      |   107.83

-- 
Thanks
Zhou

On 10/14/2015 01:24 PM, Atsushi Kumagai wrote:
> Hello,
> 
> I have fixed the performance issue with parallel compression
> that we faced in:
> 
>    http://lists.infradead.org/pipermail/kexec/2015-July/014137.html
> 
> The cause of the issue is that compress2() calls malloc() and free()
> for a temp buffer in each call, which can cause many page faults since
> makedumpfile has to call compress2() for each page.
> 
> It's easy to avoid the issue: just divide compress2() into three
> functions, namely an initialization part, a compression part and a
> finalization part. Then we don't need to call the initialization
> function and the finalization function for each page.
> 
> In order to benchmark, I measured the execution time and the number of
> page faults by *perf stat -e page-faults* on the current devel 
> branch(v1.5.8+).
> 
> The result is here:
> 
>    CPU:   Dual-Core AMD Opteron(tm) Processor 2218 @ 2.6GHz (4 cores)
>    Memory:  5GB
>    zlib:  1.2.3-29
>    glibc: 2.12-1.132
> 
>          version      |  num-threads  |  time(sec)  |   page-faults
>       ----------------+---------------+-------------+------------------
>            devel      |       1       |   133.96    |    21,801,120
>            devel      |       3       |    87.25    |    21,801,150
>        + this patch   |       1       |    47.80    |     1,036,408
>        + this patch   |       3       |    39.14    |     1,036,478
> 
> 
> Thanks
> Atsushi Kumagai
> 
> 
> From: Atsushi Kumagai <[email protected]>
> Date: Thu, 8 Oct 2015 15:06:08 +0900
> Subject: [PATCH] Improve performance for parallel compression with zlib.
> 
> compress2() allocates a buffer, compresses the input data and
> deallocates the buffer in each call. makedumpfile has to call
> compress2() for each page, which can cause a big performance
> degradation due to many page faults. This issue is
> especially apparent in the case of multi-threaded compression,
> since a per-thread arena is grown and trimmed more easily than
> the main arena.
> 
> Fortunately, the zlib functions called in compress2() are global,
> so it's easy to extract the allocation and deallocation parts from
> compress2().
> 
> Signed-off-by: Atsushi Kumagai <[email protected]>
> ---
>   makedumpfile.c | 57 
> ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
>   makedumpfile.h |  4 ++++
>   2 files changed, 60 insertions(+), 1 deletion(-)
> 
> diff --git a/makedumpfile.c b/makedumpfile.c
> index 06c8baf..fa0b779 100644
> --- a/makedumpfile.c
> +++ b/makedumpfile.c
> @@ -25,6 +25,7 @@
>   #include <sys/time.h>
>   #include <limits.h>
>   #include <assert.h>
> +#include <zlib.h>
>   
>   struct symbol_table symbol_table;
>   struct size_table   size_table;
> @@ -3538,6 +3539,11 @@ initial_for_parallel()
>               MMAP_CACHE_PARALLEL(i)->mmap_start_offset = 0;
>               MMAP_CACHE_PARALLEL(i)->mmap_end_offset = 0;
>   
> +             if (initialize_zlib(&ZLIB_STREAM_PARALLEL(i), Z_BEST_SPEED) == 
> FALSE) {
> +                     ERRMSG("zlib initialization failed.\n");
> +                     return FALSE;
> +             }
> +
>   #ifdef USELZO
>               if ((WRKMEM_PARALLEL(i) = malloc(LZO1X_1_MEM_COMPRESS)) == 
> NULL) {
>                       MSG("Can't allocate memory for the working memory. 
> %s\n",
> @@ -3628,6 +3634,7 @@ free_for_parallel()
>   
>                               free(MMAP_CACHE_PARALLEL(i));
>                       }
> +                     finalize_zlib(&ZLIB_STREAM_PARALLEL(i));
>   #ifdef USELZO
>                       if (WRKMEM_PARALLEL(i) != NULL)
>                               free(WRKMEM_PARALLEL(i));
> @@ -7017,6 +7024,53 @@ write_kdump_page(struct cache_data *cd_header, struct 
> cache_data *cd_page,
>       return TRUE;
>   }
>   
> +int initialize_zlib(z_stream *stream, int level)
> +{
> +     int err;
> +
> +     stream->zalloc = (alloc_func)Z_NULL;
> +     stream->zfree = (free_func)Z_NULL;
> +     stream->opaque = (voidpf)Z_NULL;
> +
> +     err = deflateInit(stream, level);
> +     if (err != Z_OK) {
> +             ERRMSG("deflateInit failed: %s\n", zError(err));
> +             return FALSE;
> +     }
> +     return TRUE;
> +}
> +
> +int compress_mdf (z_stream *stream, Bytef *dest, uLongf *destLen,
> +               const Bytef *source, uLong sourceLen, int level)
> +{
> +     int err;
> +     stream->next_in = (Bytef*)source;
> +     stream->avail_in = (uInt)sourceLen;
> +     stream->next_out = dest;
> +     stream->avail_out = (uInt)*destLen;
> +     if ((uLong)stream->avail_out != *destLen)
> +             return Z_BUF_ERROR;
> +
> +     err = deflate(stream, Z_FINISH);
> +
> +     if (err != Z_STREAM_END) {
> +             deflateReset(stream);
> +             return err == Z_OK ? Z_BUF_ERROR : err;
> +     }
> +     *destLen = stream->total_out;
> +
> +     err = deflateReset(stream);
> +     return err;
> +}
> +
> +int finalize_zlib(z_stream *stream)
> +{
> +     int err;
> +     err = deflateEnd(stream);
> +
> +     return err;
> +}
> +
>   void *
>   kdump_thread_function_cyclic(void *arg) {
>       void *retval = PTHREAD_FAIL;
> @@ -7035,6 +7089,7 @@ kdump_thread_function_cyclic(void *arg) {
>       struct mmap_cache *mmap_cache =
>                       MMAP_CACHE_PARALLEL(kdump_thread_args->thread_num);
>       unsigned long size_out;
> +     z_stream *stream = &ZLIB_STREAM_PARALLEL(kdump_thread_args->thread_num);
>   #ifdef USELZO
>       lzo_bytep wrkmem = WRKMEM_PARALLEL(kdump_thread_args->thread_num);
>   #endif
> @@ -7135,7 +7190,7 @@ kdump_thread_function_cyclic(void *arg) {
>                       size_out = kdump_thread_args->len_buf_out;
>                       if ((info->flag_compress & DUMP_DH_COMPRESSED_ZLIB)
>                           && ((size_out = kdump_thread_args->len_buf_out),
> -                             compress2(buf_out, &size_out, buf,
> +                             compress_mdf(stream, buf_out, &size_out, buf,
>                                         info->page_size,
>                                         Z_BEST_SPEED) == Z_OK)
>                           && (size_out < info->page_size)) {
> diff --git a/makedumpfile.h b/makedumpfile.h
> index 0bd6425..cb8f0f3 100644
> --- a/makedumpfile.h
> +++ b/makedumpfile.h
> @@ -438,6 +438,7 @@ do { \
>   #define BUF_PARALLEL(i)                     info->parallel_info[i].buf
>   #define BUF_OUT_PARALLEL(i)         info->parallel_info[i].buf_out
>   #define MMAP_CACHE_PARALLEL(i)              
> info->parallel_info[i].mmap_cache
> +#define ZLIB_STREAM_PARALLEL(i)              
> info->parallel_info[i].zlib_stream
>   #ifdef USELZO
>   #define WRKMEM_PARALLEL(i)          info->parallel_info[i].wrkmem
>   #endif
> @@ -1050,6 +1051,7 @@ struct parallel_info {
>       unsigned char           *buf;
>       unsigned char           *buf_out;
>       struct mmap_cache       *mmap_cache;
> +     z_stream                zlib_stream;
>   #ifdef USELZO
>       lzo_bytep               wrkmem;
>   #endif
> @@ -2051,5 +2053,7 @@ int initial_xen(void);
>   unsigned long long get_free_memory_size(void);
>   int calculate_cyclic_buffer_size(void);
>   int prepare_splitblock_table(void);
> +int initialize_zlib(z_stream *stream, int level);
> +int finalize_zlib(z_stream *stream);
>   
>   #endif /* MAKEDUMPFILE_H */
> 

_______________________________________________
kexec mailing list
[email protected]
http://lists.infradead.org/mailman/listinfo/kexec

Reply via email to