Include a decompression testcase for the powerpc NX-GZIP
engine.

Signed-off-by: Bulent Abali <ab...@us.ibm.com>
Signed-off-by: Raphael Moreira Zinsly <rzin...@linux.ibm.com>
---
 .../selftests/powerpc/nx-gzip/Makefile        |    7 +-
 .../selftests/powerpc/nx-gzip/gunz_test.c     | 1078 +++++++++++++++++
 2 files changed, 1082 insertions(+), 3 deletions(-)
 create mode 100644 tools/testing/selftests/powerpc/nx-gzip/gunz_test.c

diff --git a/tools/testing/selftests/powerpc/nx-gzip/Makefile 
b/tools/testing/selftests/powerpc/nx-gzip/Makefile
index ab903f63bbbd..82abc19a49a0 100644
--- a/tools/testing/selftests/powerpc/nx-gzip/Makefile
+++ b/tools/testing/selftests/powerpc/nx-gzip/Makefile
@@ -1,9 +1,9 @@
 CC = gcc
 CFLAGS = -O3
 INC = ./inc
-SRC = gzfht_test.c
+SRC = gzfht_test.c gunz_test.c
 OBJ = $(SRC:.c=.o)
-TESTS = gzfht_test
+TESTS = gzfht_test gunz_test
 EXTRA_SOURCES = gzip_vas.c
 
 all:   $(TESTS)
@@ -16,6 +16,7 @@ $(TESTS): $(OBJ)
 
 run_tests: $(TESTS)
        ./gzfht_test gzip_vas.c
+       ./gunz_test gzip_vas.c.nx.gz
 
 clean:
-       rm -f $(TESTS) *.o *~ *.gz
+       rm -f $(TESTS) *.o *~ *.gz *.gunzip
diff --git a/tools/testing/selftests/powerpc/nx-gzip/gunz_test.c 
b/tools/testing/selftests/powerpc/nx-gzip/gunz_test.c
new file mode 100644
index 000000000000..82eb268a8397
--- /dev/null
+++ b/tools/testing/selftests/powerpc/nx-gzip/gunz_test.c
@@ -0,0 +1,1078 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+/* P9 gunzip sample code for demonstrating the P9 NX hardware
+ * interface.  Not intended for productive uses or for performance or
+ * compression ratio measurements.  Note also that /dev/crypto/gzip,
+ * VAS and skiboot support are required
+ *
+ * Copyright 2020 IBM Corp.
+ *
+ * Author: Bulent Abali <ab...@us.ibm.com>
+ *
+ * https://github.com/libnxz/power-gzip for zlib api and other utils
+ * Definitions of acronyms used here.  See
+ * P9 NX Gzip Accelerator User's Manual for details:
+ * https://github.com/libnxz/power-gzip/blob/develop/doc/power_nx_gzip_um.pdf
+ *
+ * adler/crc: 32 bit checksums appended to stream tail
+ * ce:       completion extension
+ * cpb:      coprocessor parameter block (metadata)
+ * crb:      coprocessor request block (command)
+ * csb:      coprocessor status block (status)
+ * dht:      dynamic huffman table
+ * dde:      data descriptor element (address, length)
+ * ddl:      list of ddes
+ * dh/fh:    dynamic and fixed huffman types
+ * fc:       coprocessor function code
+ * histlen:  history/dictionary length
+ * history:  sliding window of up to 32KB of data
+ * lzcount:  Deflate LZ symbol counts
+ * rembytecnt: remaining byte count
+ * sfbt:     source final block type; last block's type during decomp
+ * spbc:     source processed byte count
+ * subc:     source unprocessed bit count
+ * tebc:     target ending bit count; valid bits in the last byte
+ * tpbc:     target processed byte count
+ * vas:      virtual accelerator switch; the user mode interface
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <stdint.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/fcntl.h>
+#include <sys/mman.h>
+#include <endian.h>
+#include <bits/endian.h>
+#include <sys/ioctl.h>
+#include <assert.h>
+#include <errno.h>
+#include <signal.h>
+#include "nxu.h"
+#include "nx.h"
+#include "crb.h"
+
+int nx_dbg;
+FILE *nx_gzip_log;
+
+#define NX_MIN(X, Y) (((X) < (Y))?(X):(Y))
+#define NX_MAX(X, Y) (((X) > (Y))?(X):(Y))
+
+#define GETINPC(X) fgetc(X)
+#define FNAME_MAX 1024
+
+/* fifo queue management */
+#define fifo_used_bytes(used) (used)
+#define fifo_free_bytes(used, len) ((len)-(used))
+/* amount of free bytes in the first and last parts */
+#define fifo_free_first_bytes(cur, used, len)  ((((cur)+(used)) <= (len)) \
+                                                 ? (len)-((cur)+(used)) : 0)
+#define fifo_free_last_bytes(cur, used, len)   ((((cur)+(used)) <= (len)) \
+                                                 ? (cur) : (len)-(used))
+/* amount of used bytes in the first and last parts */
+#define fifo_used_first_bytes(cur, used, len)  ((((cur)+(used)) <= (len)) \
+                                                 ? (used) : (len)-(cur))
+#define fifo_used_last_bytes(cur, used, len)   ((((cur)+(used)) <= (len)) \
+                                                 ? 0 : ((used)+(cur))-(len))
+/* first and last free parts start here */
+#define fifo_free_first_offset(cur, used)      ((cur)+(used))
+#define fifo_free_last_offset(cur, used, len)  \
+                                          fifo_used_last_bytes(cur, used, len)
+/* first and last used parts start here */
+#define fifo_used_first_offset(cur)            (cur)
+#define fifo_used_last_offset(cur)             (0)
+
+const int fifo_in_len = 1<<24;
+const int fifo_out_len = 1<<24;
+const int page_sz = 1<<16;
+const int line_sz = 1<<7;
+const int window_max = 1<<15;
+const int retry_max = 50;
+
+void *nx_fault_storage_address;
+
+/*
+ * Fault in pages prior to NX job submission.  wr=1 may be required to
+ * touch writeable pages.  System zero pages do not fault-in the page as
+ * intended.  Typically set wr=1 for NX target pages and set wr=0 for
+ * NX source pages.
+ */
+static int nx_touch_pages(void *buf, long buf_len, long page_len, int wr)
+{
+       char *begin = buf;
+       char *end = (char *) buf + buf_len - 1;
+       char t;
+
+       assert(buf_len >= 0 && !!buf);
+
+       NXPRT(fprintf(stderr, "touch %p %p len 0x%lx wr=%d\n", buf,
+                       buf + buf_len, buf_len, wr));
+
+       if (buf_len <= 0 || buf == NULL)
+               return -1;
+
+       do {
+               t = *begin;
+               if (wr)
+                       *begin = t;
+               begin = begin + page_len;
+       } while (begin < end);
+
+       /* When buf_sz is small or buf tail is in another page. */
+       t = *end;
+       if (wr)
+               *end = t;
+
+       return 0;
+}
+
+void sigsegv_handler(int sig, siginfo_t *info, void *ctx)
+{
+       fprintf(stderr, "%d: Got signal %d si_code %d, si_addr %p\n", getpid(),
+              sig, info->si_code, info->si_addr);
+
+       nx_fault_storage_address = info->si_addr;
+}
+
+/*
+ * Adds an (address, len) pair to the list of ddes (ddl) and updates
+ * the base dde.  ddl[0] is the only dde in a direct dde which
+ * contains a single (addr,len) pair.  For more pairs, ddl[0] becomes
+ * the indirect (base) dde that points to a list of direct ddes.
+ * See Section 6.4 of the NX-gzip user manual for DDE description.
+ * Addr=NULL, len=0 clears the ddl[0].  Returns the total number of
+ * bytes in ddl.  Caller is responsible for allocting the array of
+ * nx_dde_t *ddl.  If N addresses are required in the scatter-gather
+ * list, the ddl array must have N+1 entries minimum.
+ */
+static inline uint32_t nx_append_dde(struct nx_dde_t *ddl, void *addr,
+                                       uint32_t len)
+{
+       uint32_t ddecnt;
+       uint32_t bytes;
+
+       if (addr == NULL && len == 0) {
+               clearp_dde(ddl);
+               return 0;
+       }
+
+       NXPRT(fprintf(stderr, "%d: %s addr %p len %x\n", __LINE__, addr,
+                       __func__, len));
+
+       /* Number of ddes in the dde list ; == 0 when it is a direct dde */
+       ddecnt = getpnn(ddl, dde_count);
+       bytes = getp32(ddl, ddebc);
+
+       if (ddecnt == 0 && bytes == 0) {
+               /* First dde is unused; make it a direct dde */
+               bytes = len;
+               putp32(ddl, ddebc, bytes);
+               putp64(ddl, ddead, (uint64_t) addr);
+       } else if (ddecnt == 0) {
+               /* Converting direct to indirect dde
+                * ddl[0] becomes head dde of ddl
+                * copy direct to indirect first.
+                */
+               ddl[1] = ddl[0];
+
+               /* Add the new dde next */
+               clear_dde(ddl[2]);
+               put32(ddl[2], ddebc, len);
+               put64(ddl[2], ddead, (uint64_t) addr);
+
+               /* Ddl head points to 2 direct ddes */
+               ddecnt = 2;
+               putpnn(ddl, dde_count, ddecnt);
+               bytes = bytes + len;
+               putp32(ddl, ddebc, bytes);
+               /* Pointer to the first direct dde */
+               putp64(ddl, ddead, (uint64_t) &ddl[1]);
+       } else {
+               /* Append a dde to an existing indirect ddl */
+               ++ddecnt;
+               clear_dde(ddl[ddecnt]);
+               put64(ddl[ddecnt], ddead, (uint64_t) addr);
+               put32(ddl[ddecnt], ddebc, len);
+
+               putpnn(ddl, dde_count, ddecnt);
+               bytes = bytes + len;
+               putp32(ddl, ddebc, bytes); /* byte sum of all dde */
+       }
+       return bytes;
+}
+
+/*
+ * Touch specified number of pages represented in number bytes
+ * beginning from the first buffer in a dde list.
+ * Do not touch the pages past buf_sz-th byte's page.
+ *
+ * Set buf_sz = 0 to touch all pages described by the ddep.
+ */
+static int nx_touch_pages_dde(struct nx_dde_t *ddep, long buf_sz, long page_sz,
+                               int wr)
+{
+       uint32_t indirect_count;
+       uint32_t buf_len;
+       long total;
+       uint64_t buf_addr;
+       struct nx_dde_t *dde_list;
+       int i;
+
+       assert(!!ddep);
+
+       indirect_count = getpnn(ddep, dde_count);
+
+       NXPRT(fprintf(stderr, "%s dde_count %d request len ", __func__,
+                       indirect_count));
+       NXPRT(fprintf(stderr, "0x%lx\n", buf_sz));
+
+       if (indirect_count == 0) {
+               /* Direct dde */
+               buf_len = getp32(ddep, ddebc);
+               buf_addr = getp64(ddep, ddead);
+
+               NXPRT(fprintf(stderr, "touch direct ddebc 0x%x ddead %p\n",
+                               buf_len, (void *)buf_addr));
+
+               if (buf_sz == 0)
+                       nx_touch_pages((void *)buf_addr, buf_len, page_sz, wr);
+               else
+                       nx_touch_pages((void *)buf_addr, NX_MIN(buf_len,
+                                       buf_sz), page_sz, wr);
+
+               return ERR_NX_OK;
+       }
+
+       /* Indirect dde */
+       if (indirect_count > MAX_DDE_COUNT)
+               return ERR_NX_EXCESSIVE_DDE;
+
+       /* First address of the list */
+       dde_list = (struct nx_dde_t *) getp64(ddep, ddead);
+
+       if (buf_sz == 0)
+               buf_sz = getp32(ddep, ddebc);
+
+       total = 0;
+       for (i = 0; i < indirect_count; i++) {
+               buf_len = get32(dde_list[i], ddebc);
+               buf_addr = get64(dde_list[i], ddead);
+               total += buf_len;
+
+               NXPRT(fprintf(stderr, "touch loop len 0x%x ddead %p total ",
+                               buf_len, (void *)buf_addr));
+               NXPRT(fprintf(stderr, "0x%lx\n", total));
+
+               /* Touching fewer pages than encoded in the ddebc */
+               if (total > buf_sz) {
+                       buf_len = NX_MIN(buf_len, total - buf_sz);
+                       nx_touch_pages((void *)buf_addr, buf_len, page_sz, wr);
+                       NXPRT(fprintf(stderr, "touch loop break len 0x%x ",
+                                     buf_len));
+                       NXPRT(fprintf(stderr, "ddead %p\n", (void *)buf_addr));
+                       break;
+               }
+               nx_touch_pages((void *)buf_addr, buf_len, page_sz, wr);
+       }
+       return ERR_NX_OK;
+}
+
+/*
+ * Src and dst buffers are supplied in scatter gather lists.
+ * NX function code and other parameters supplied in cmdp.
+ */
+static int nx_submit_job(struct nx_dde_t *src, struct nx_dde_t *dst,
+                        struct nx_gzip_crb_cpb_t *cmdp, void *handle)
+{
+       int cc;
+       uint64_t csbaddr;
+
+       memset((void *)&cmdp->crb.csb, 0, sizeof(cmdp->crb.csb));
+
+       cmdp->crb.source_dde = *src;
+       cmdp->crb.target_dde = *dst;
+
+       /* Status, output byte count in tpbc */
+       csbaddr = ((uint64_t) &cmdp->crb.csb) & csb_address_mask;
+       put64(cmdp->crb, csb_address, csbaddr);
+
+       /* NX reports input bytes in spbc; cleared */
+       cmdp->cpb.out_spbc_comp_wrap = 0;
+       cmdp->cpb.out_spbc_comp_with_count = 0;
+       cmdp->cpb.out_spbc_decomp = 0;
+
+       /* Clear output */
+       put32(cmdp->cpb, out_crc, INIT_CRC);
+       put32(cmdp->cpb, out_adler, INIT_ADLER);
+
+       cc = nxu_run_job(cmdp, handle);
+
+       if (!cc)
+               cc = getnn(cmdp->crb.csb, csb_cc);      /* CC Table 6-8 */
+
+       return cc;
+}
+
+int decompress_file(int argc, char **argv, void *devhandle)
+{
+       FILE *inpf;
+       FILE *outf;
+
+       int c, expect, i, cc, rc = 0;
+       char gzfname[FNAME_MAX];
+
+       /* Queuing, file ops, byte counting */
+       char *fifo_in, *fifo_out;
+       int used_in, cur_in, used_out, cur_out, read_sz, n;
+       int first_free, last_free, first_used, last_used;
+       int first_offset, last_offset;
+       int write_sz, free_space, source_sz;
+       int source_sz_estimate, target_sz_estimate;
+       uint64_t last_comp_ratio; /* 1000 max */
+       uint64_t total_out;
+       int is_final, is_eof;
+
+       /* nx hardware */
+       int sfbt, subc, spbc, tpbc, nx_ce, fc, resuming = 0;
+       int history_len = 0;
+       struct nx_gzip_crb_cpb_t cmd, *cmdp;
+       struct nx_dde_t *ddl_in;
+       struct nx_dde_t dde_in[6] __aligned(128);
+       struct nx_dde_t *ddl_out;
+       struct nx_dde_t dde_out[6] __aligned(128);
+       int pgfault_retries;
+
+       /* when using mmap'ed files */
+       off_t input_file_offset;
+
+       if (argc > 2) {
+               fprintf(stderr, "usage: %s <fname> or stdin\n", argv[0]);
+               fprintf(stderr, "    writes to stdout or <fname>.nx.gunzip\n");
+               return -1;
+       }
+
+       if (argc == 1) {
+               inpf = stdin;
+               outf = stdout;
+       } else if (argc == 2) {
+               char w[1024];
+               char *wp;
+
+               inpf = fopen(argv[1], "r");
+               if (inpf == NULL) {
+                       perror(argv[1]);
+                       return -1;
+               }
+
+               /* Make a new file name to write to.  Ignoring '.gz' */
+               wp = (NULL != (wp = strrchr(argv[1], '/'))) ? (wp+1) : argv[1];
+               strcpy(w, wp);
+               strcat(w, ".nx.gunzip");
+
+               outf = fopen(w, "w");
+               if (outf == NULL) {
+                       perror(w);
+                       return -1;
+               }
+       }
+
+       /* Decode the gzip header */
+       c = GETINPC(inpf); expect = 0x1f; /* ID1 */
+       if (c != expect)
+               goto err1;
+
+       c = GETINPC(inpf); expect = 0x8b; /* ID2 */
+       if (c != expect)
+               goto err1;
+
+       c = GETINPC(inpf); expect = 0x08; /* CM */
+       if (c != expect)
+               goto err1;
+
+       int flg = GETINPC(inpf); /* FLG */
+
+       if (flg & 0xE0 || flg & 0x4 || flg == EOF)
+               goto err2;
+
+       fprintf(stderr, "gzHeader FLG %x\n", flg);
+
+       /* Read 6 bytes; ignoring the MTIME, XFL, OS fields in this
+        * sample code.
+        */
+       for (i = 0; i < 6; i++) {
+               char tmp[10];
+
+               tmp[i] = GETINPC(inpf);
+               if (tmp[i] == EOF)
+                       goto err3;
+               fprintf(stderr, "%02x ", tmp[i]);
+               if (i == 5)
+                       fprintf(stderr, "\n");
+       }
+       fprintf(stderr, "gzHeader MTIME, XFL, OS ignored\n");
+
+       /* FNAME */
+       if (flg & 0x8) {
+               int k = 0;
+
+               do {
+                       c = GETINPC(inpf);
+                       if (c == EOF || k >= FNAME_MAX)
+                               goto err3;
+                       gzfname[k++] = c;
+               } while (c);
+               fprintf(stderr, "gzHeader FNAME: %s\n", gzfname);
+       }
+
+       /* FHCRC */
+       if (flg & 0x2) {
+               c = GETINPC(inpf);
+               if (c == EOF)
+                       goto err3;
+               c = GETINPC(inpf);
+               if (c == EOF)
+                       goto err3;
+               fprintf(stderr, "gzHeader FHCRC: ignored\n");
+       }
+
+       used_in = cur_in = used_out = cur_out = 0;
+       is_final = is_eof = 0;
+
+       /* Allocate one page larger to prevent page faults due to NX
+        * overfetching.
+        * Either do this (char*)(uintptr_t)aligned_alloc or use
+        * -std=c11 flag to make the int-to-pointer warning go away.
+        */
+       assert((fifo_in  = (char *)(uintptr_t)aligned_alloc(line_sz,
+                                  fifo_in_len + page_sz)) != NULL);
+       assert((fifo_out = (char *)(uintptr_t)aligned_alloc(line_sz,
+                                  fifo_out_len + page_sz + line_sz)) != NULL);
+       /* Leave unused space due to history rounding rules */
+       fifo_out = fifo_out + line_sz;
+       nx_touch_pages(fifo_out, fifo_out_len, page_sz, 1);
+
+       ddl_in  = &dde_in[0];
+       ddl_out = &dde_out[0];
+       cmdp = &cmd;
+       memset(&cmdp->crb, 0, sizeof(cmdp->crb));
+
+read_state:
+
+       /* Read from .gz file */
+
+       NXPRT(fprintf(stderr, "read_state:\n"));
+
+       if (is_eof != 0)
+               goto write_state;
+
+       /* We read in to fifo_in in two steps: first: read in to from
+        * cur_in to the end of the buffer.  last: if free space wrapped
+        * around, read from fifo_in offset 0 to offset cur_in.
+        */
+
+       /* Reset fifo head to reduce unnecessary wrap arounds */
+       cur_in = (used_in == 0) ? 0 : cur_in;
+
+       /* Free space total is reduced by a gap */
+       free_space = NX_MAX(0, fifo_free_bytes(used_in, fifo_in_len)
+                           - line_sz);
+
+       /* Free space may wrap around as first and last */
+       first_free = fifo_free_first_bytes(cur_in, used_in, fifo_in_len);
+       last_free  = fifo_free_last_bytes(cur_in, used_in, fifo_in_len);
+
+       /* Start offsets of the free memory */
+       first_offset = fifo_free_first_offset(cur_in, used_in);
+       last_offset  = fifo_free_last_offset(cur_in, used_in, fifo_in_len);
+
+       /* Reduce read_sz because of the line_sz gap */
+       read_sz = NX_MIN(free_space, first_free);
+       n = 0;
+       if (read_sz > 0) {
+               /* Read in to offset cur_in + used_in */
+               n = fread(fifo_in + first_offset, 1, read_sz, inpf);
+               used_in = used_in + n;
+               free_space = free_space - n;
+               assert(n <= read_sz);
+               if (n != read_sz) {
+                       /* Either EOF or error; exit the read loop */
+                       is_eof = 1;
+                       goto write_state;
+               }
+       }
+
+       /* If free space wrapped around */
+       if (last_free > 0) {
+               /* Reduce read_sz because of the line_sz gap */
+               read_sz = NX_MIN(free_space, last_free);
+               n = 0;
+               if (read_sz > 0) {
+                       n = fread(fifo_in + last_offset, 1, read_sz, inpf);
+                       used_in = used_in + n;       /* Increase used space */
+                       free_space = free_space - n; /* Decrease free space */
+                       assert(n <= read_sz);
+                       if (n != read_sz) {
+                               /* Either EOF or error; exit the read loop */
+                               is_eof = 1;
+                               goto write_state;
+                       }
+               }
+       }
+
+       /* At this point we have used_in bytes in fifo_in with the
+        * data head starting at cur_in and possibly wrapping around.
+        */
+
+write_state:
+
+       /* Write decompressed data to output file */
+
+       NXPRT(fprintf(stderr, "write_state:\n"));
+
+       if (used_out == 0)
+               goto decomp_state;
+
+       /* If fifo_out has data waiting, write it out to the file to
+        * make free target space for the accelerator used bytes in
+        * the first and last parts of fifo_out.
+        */
+
+       first_used = fifo_used_first_bytes(cur_out, used_out, fifo_out_len);
+       last_used  = fifo_used_last_bytes(cur_out, used_out, fifo_out_len);
+
+       write_sz = first_used;
+
+       n = 0;
+       if (write_sz > 0) {
+               n = fwrite(fifo_out + cur_out, 1, write_sz, outf);
+               used_out = used_out - n;
+               /* Move head of the fifo */
+               cur_out = (cur_out + n) % fifo_out_len;
+               assert(n <= write_sz);
+               if (n != write_sz) {
+                       fprintf(stderr, "error: write\n");
+                       rc = -1;
+                       goto err5;
+               }
+       }
+
+       if (last_used > 0) { /* If more data available in the last part */
+               write_sz = last_used; /* Keep it here for later */
+               n = 0;
+               if (write_sz > 0) {
+                       n = fwrite(fifo_out, 1, write_sz, outf);
+                       used_out = used_out - n;
+                       cur_out = (cur_out + n) % fifo_out_len;
+                       assert(n <= write_sz);
+                       if (n != write_sz) {
+                               fprintf(stderr, "error: write\n");
+                               rc = -1;
+                               goto err5;
+                       }
+               }
+       }
+
+decomp_state:
+
+       /* NX decompresses input data */
+
+       NXPRT(fprintf(stderr, "decomp_state:\n"));
+
+       if (is_final)
+               goto finish_state;
+
+       /* Address/len lists */
+       clearp_dde(ddl_in);
+       clearp_dde(ddl_out);
+
+       /* FC, CRC, HistLen, Table 6-6 */
+       if (resuming) {
+               /* Resuming a partially decompressed input.
+                * The key to resume is supplying the 32KB
+                * dictionary (history) to NX, which is basically
+                * the last 32KB of output produced.
+                */
+               fc = GZIP_FC_DECOMPRESS_RESUME;
+
+               cmdp->cpb.in_crc   = cmdp->cpb.out_crc;
+               cmdp->cpb.in_adler = cmdp->cpb.out_adler;
+
+               /* Round up the history size to quadword.  Section 2.10 */
+               history_len = (history_len + 15) / 16;
+               putnn(cmdp->cpb, in_histlen, history_len);
+               history_len = history_len * 16; /* bytes */
+
+               if (history_len > 0) {
+                       /* Chain in the history buffer to the DDE list */
+                       if (cur_out >= history_len) {
+                               nx_append_dde(ddl_in, fifo_out
+                                             + (cur_out - history_len),
+                                             history_len);
+                       } else {
+                               nx_append_dde(ddl_in, fifo_out
+                                             + ((fifo_out_len + cur_out)
+                                             - history_len),
+                                             history_len - cur_out);
+                               /* Up to 32KB history wraps around fifo_out */
+                               nx_append_dde(ddl_in, fifo_out, cur_out);
+                       }
+
+               }
+       } else {
+               /* First decompress job */
+               fc = GZIP_FC_DECOMPRESS;
+
+               history_len = 0;
+               /* Writing 0 clears out subc as well */
+               cmdp->cpb.in_histlen = 0;
+               total_out = 0;
+
+               put32(cmdp->cpb, in_crc, INIT_CRC);
+               put32(cmdp->cpb, in_adler, INIT_ADLER);
+               put32(cmdp->cpb, out_crc, INIT_CRC);
+               put32(cmdp->cpb, out_adler, INIT_ADLER);
+
+               /* Assuming 10% compression ratio initially; use the
+                * most recently measured compression ratio as a
+                * heuristic to estimate the input and output
+                * sizes.  If we give too much input, the target buffer
+                * overflows and NX cycles are wasted, and then we
+                * must retry with smaller input size.  1000 is 100%.
+                */
+               last_comp_ratio = 100UL;
+       }
+       cmdp->crb.gzip_fc = 0;
+       putnn(cmdp->crb, gzip_fc, fc);
+
+       /*
+        * NX source buffers
+        */
+       first_used = fifo_used_first_bytes(cur_in, used_in, fifo_in_len);
+       last_used = fifo_used_last_bytes(cur_in, used_in, fifo_in_len);
+
+       if (first_used > 0)
+               nx_append_dde(ddl_in, fifo_in + cur_in, first_used);
+
+       if (last_used > 0)
+               nx_append_dde(ddl_in, fifo_in, last_used);
+
+       /*
+        * NX target buffers
+        */
+       first_free = fifo_free_first_bytes(cur_out, used_out, fifo_out_len);
+       last_free = fifo_free_last_bytes(cur_out, used_out, fifo_out_len);
+
+       /* Reduce output free space amount not to overwrite the history */
+       int target_max = NX_MAX(0, fifo_free_bytes(used_out, fifo_out_len)
+                               - (1<<16));
+
+       NXPRT(fprintf(stderr, "target_max %d (0x%x)\n", target_max,
+                     target_max));
+
+       first_free = NX_MIN(target_max, first_free);
+       if (first_free > 0) {
+               first_offset = fifo_free_first_offset(cur_out, used_out);
+               nx_append_dde(ddl_out, fifo_out + first_offset, first_free);
+       }
+
+       if (last_free > 0) {
+               last_free = NX_MIN(target_max - first_free, last_free);
+               if (last_free > 0) {
+                       last_offset = fifo_free_last_offset(cur_out, used_out,
+                                                           fifo_out_len);
+                       nx_append_dde(ddl_out, fifo_out + last_offset,
+                                     last_free);
+               }
+       }
+
+       /* Target buffer size is used to limit the source data size
+        * based on previous measurements of compression ratio.
+        */
+
+       /* source_sz includes history */
+       source_sz = getp32(ddl_in, ddebc);
+       assert(source_sz > history_len);
+       source_sz = source_sz - history_len;
+
+       /* Estimating how much source is needed to 3/4 fill a
+        * target_max size target buffer.  If we overshoot, then NX
+        * must repeat the job with smaller input and we waste
+        * bandwidth.  If we undershoot then we use more NX calls than
+        * necessary.
+        */
+
+       source_sz_estimate = ((uint64_t)target_max * last_comp_ratio * 3UL)
+                               / 4000;
+
+       if (source_sz_estimate < source_sz) {
+               /* Target might be small, therefore limiting the
+                * source data.
+                */
+               source_sz = source_sz_estimate;
+               target_sz_estimate = target_max;
+       } else {
+               /* Source file might be small, therefore limiting target
+                * touch pages to a smaller value to save processor cycles.
+                */
+               target_sz_estimate = ((uint64_t)source_sz * 1000UL)
+                                       / (last_comp_ratio + 1);
+               target_sz_estimate = NX_MIN(2 * target_sz_estimate,
+                                           target_max);
+       }
+
+       source_sz = source_sz + history_len;
+
+       /* Some NX condition codes require submitting the NX job again.
+        * Kernel doesn't handle NX page faults. Expects user code to
+        * touch pages.
+        */
+       pgfault_retries = retry_max;
+
+restart_nx:
+
+       putp32(ddl_in, ddebc, source_sz);
+
+       /* Fault in pages */
+       nx_touch_pages_dde(ddl_in, 0, page_sz, 0);
+       nx_touch_pages_dde(ddl_out, target_sz_estimate, page_sz, 1);
+
+       /* Send job to NX */
+       cc = nx_submit_job(ddl_in, ddl_out, cmdp, devhandle);
+
+       switch (cc) {
+
+       case ERR_NX_TRANSLATION:
+
+               /* We touched the pages ahead of time.  In the most common case
+                * we shouldn't be here.  But may be some pages were paged out.
+                * Kernel should have placed the faulting address to fsaddr.
+                */
+               NXPRT(fprintf(stderr, "ERR_NX_TRANSLATION %p\n",
+                             (void *)cmdp->crb.csb.fsaddr));
+
+               /* Touch 1 byte, read-only  */
+               nx_touch_pages((void *)cmdp->crb.csb.fsaddr, 1, page_sz, 0);
+
+               if (pgfault_retries == retry_max) {
+                       /* Try once with exact number of pages */
+                       --pgfault_retries;
+                       goto restart_nx;
+               } else if (pgfault_retries > 0) {
+                       /* If still faulting try fewer input pages
+                        * assuming memory outage
+                        */
+                       if (source_sz > page_sz)
+                               source_sz = NX_MAX(source_sz / 2, page_sz);
+                       --pgfault_retries;
+                       goto restart_nx;
+               } else {
+                       fprintf(stderr, "cannot make progress; too many ");
+                       fprintf(stderr, "page fault retries cc= %d\n", cc);
+                       rc = -1;
+                       goto err5;
+               }
+
+       case ERR_NX_DATA_LENGTH:
+
+               NXPRT(fprintf(stderr, "ERR_NX_DATA_LENGTH; "));
+               NXPRT(fprintf(stderr, "stream may have trailing data\n"));
+
+               /* Not an error in the most common case; it just says
+                * there is trailing data that we must examine.
+                *
+                * CC=3 CE(1)=0 CE(0)=1 indicates partial completion
+                * Fig.6-7 and Table 6-8.
+                */
+               nx_ce = get_csb_ce_ms3b(cmdp->crb.csb);
+
+               if (!csb_ce_termination(nx_ce) &&
+                   csb_ce_partial_completion(nx_ce)) {
+                       /* Check CPB for more information
+                        * spbc and tpbc are valid
+                        */
+                       sfbt = getnn(cmdp->cpb, out_sfbt); /* Table 6-4 */
+                       subc = getnn(cmdp->cpb, out_subc); /* Table 6-4 */
+                       spbc = get32(cmdp->cpb, out_spbc_decomp);
+                       tpbc = get32(cmdp->crb.csb, tpbc);
+                       assert(target_max >= tpbc);
+
+                       goto ok_cc3; /* not an error */
+               } else {
+                       /* History length error when CE(1)=1 CE(0)=0. */
+                       rc = -1;
+                       fprintf(stderr, "history length error cc= %d\n", cc);
+                       goto err5;
+               }
+
+       case ERR_NX_TARGET_SPACE:
+
+               /* Target buffer not large enough; retry smaller input
+                * data; give at least 1 byte.  SPBC/TPBC are not valid.
+                */
+               assert(source_sz > history_len);
+               source_sz = ((source_sz - history_len + 2) / 2) + history_len;
+               NXPRT(fprintf(stderr, "ERR_NX_TARGET_SPACE; retry with "));
+               NXPRT(fprintf(stderr, "smaller input data src %d hist %d\n",
+                             source_sz, history_len));
+               goto restart_nx;
+
+       case ERR_NX_OK:
+
+               /* This should not happen for gzip formatted data;
+                * we need trailing crc and isize
+                */
+               fprintf(stderr, "ERR_NX_OK\n");
+               spbc = get32(cmdp->cpb, out_spbc_decomp);
+               tpbc = get32(cmdp->crb.csb, tpbc);
+               assert(target_max >= tpbc);
+               assert(spbc >= history_len);
+               source_sz = spbc - history_len;
+               goto offsets_state;
+
+       default:
+               fprintf(stderr, "error: cc= %d\n", cc);
+               rc = -1;
+               goto err5;
+       }
+
+ok_cc3:
+
+       NXPRT(fprintf(stderr, "cc3: sfbt: %x\n", sfbt));
+
+       assert(spbc > history_len);
+       source_sz = spbc - history_len;
+
+       /* Table 6-4: Source Final Block Type (SFBT) describes the
+        * last processed deflate block and clues the software how to
+        * resume the next job.  SUBC indicates how many input bits NX
+        * consumed but did not process.  SPBC indicates how many
+        * bytes of source were given to the accelerator including
+        * history bytes.
+        */
+
+       switch (sfbt) {
+               int dhtlen;
+
+       case 0x0: /* Deflate final EOB received */
+
+               /* Calculating the checksum start position. */
+
+               source_sz = source_sz - subc / 8;
+               is_final = 1;
+               break;
+
+               /* Resume decompression cases are below. Basically
+                * indicates where NX has suspended and how to resume
+                * the input stream.
+                */
+
+       case 0x8: /* Within a literal block; use rembytecount */
+       case 0x9: /* Within a literal block; use rembytecount; bfinal=1 */
+
+               /* Supply the partially processed source byte again */
+               source_sz = source_sz - ((subc + 7) / 8);
+
+               /* SUBC LS 3bits: number of bits in the first source byte need
+                * to be processed.
+                * 000 means all 8 bits;  Table 6-3
+                * Clear subc, histlen, sfbt, rembytecnt, dhtlen
+                */
+               cmdp->cpb.in_subc = 0;
+               cmdp->cpb.in_sfbt = 0;
+               putnn(cmdp->cpb, in_subc, subc % 8);
+               putnn(cmdp->cpb, in_sfbt, sfbt);
+               putnn(cmdp->cpb, in_rembytecnt, getnn(cmdp->cpb,
+                                                     out_rembytecnt));
+               break;
+
+       case 0xA: /* Within a FH block; */
+       case 0xB: /* Within a FH block; bfinal=1 */
+
+               source_sz = source_sz - ((subc + 7) / 8);
+
+               /* Clear subc, histlen, sfbt, rembytecnt, dhtlen */
+               cmdp->cpb.in_subc = 0;
+               cmdp->cpb.in_sfbt = 0;
+               putnn(cmdp->cpb, in_subc, subc % 8);
+               putnn(cmdp->cpb, in_sfbt, sfbt);
+               break;
+
+       case 0xC: /* Within a DH block; */
+       case 0xD: /* Within a DH block; bfinal=1 */
+
+               source_sz = source_sz - ((subc + 7) / 8);
+
+               /* Clear subc, histlen, sfbt, rembytecnt, dhtlen */
+               cmdp->cpb.in_subc = 0;
+               cmdp->cpb.in_sfbt = 0;
+               putnn(cmdp->cpb, in_subc, subc % 8);
+               putnn(cmdp->cpb, in_sfbt, sfbt);
+
+               dhtlen = getnn(cmdp->cpb, out_dhtlen);
+               putnn(cmdp->cpb, in_dhtlen, dhtlen);
+               assert(dhtlen >= 42);
+
+               /* Round up to a qword */
+               dhtlen = (dhtlen + 127) / 128;
+
+               while (dhtlen > 0) { /* Copy dht from cpb.out to cpb.in */
+                       --dhtlen;
+                       cmdp->cpb.in_dht[dhtlen] = cmdp->cpb.out_dht[dhtlen];
+               }
+               break;
+
+       case 0xE: /* Within a block header; bfinal=0; */
+                    /* Also given if source data exactly ends (SUBC=0) with
+                     * EOB code with BFINAL=0.  Means the next byte will
+                     * contain a block header.
+                     */
+       case 0xF: /* within a block header with BFINAL=1. */
+
+               source_sz = source_sz - ((subc + 7) / 8);
+
+               /* Clear subc, histlen, sfbt, rembytecnt, dhtlen */
+               cmdp->cpb.in_subc = 0;
+               cmdp->cpb.in_sfbt = 0;
+               putnn(cmdp->cpb, in_subc, subc % 8);
+               putnn(cmdp->cpb, in_sfbt, sfbt);
+
+               /* Engine did not process any data */
+               if (is_eof && (source_sz == 0))
+                       is_final = 1;
+       }
+
+offsets_state:
+
+       /* Adjust the source and target buffer offsets and lengths  */
+
+       NXPRT(fprintf(stderr, "offsets_state:\n"));
+
+       /* Delete input data from fifo_in */
+       used_in = used_in - source_sz;
+       cur_in = (cur_in + source_sz) % fifo_in_len;
+       input_file_offset = input_file_offset + source_sz;
+
+       /* Add output data to fifo_out */
+       used_out = used_out + tpbc;
+
+       assert(used_out <= fifo_out_len);
+
+       total_out = total_out + tpbc;
+
+       /* Deflate history is 32KB max.  No need to supply more
+        * than 32KB on a resume.
+        */
+       history_len = (total_out > window_max) ? window_max : total_out;
+
+       /* To estimate expected expansion in the next NX job; 500 means 50%.
+        * Deflate best case is around 1 to 1000.
+        */
+       last_comp_ratio = (1000UL * ((uint64_t)source_sz + 1))
+                         / ((uint64_t)tpbc + 1);
+       last_comp_ratio = NX_MAX(NX_MIN(1000UL, last_comp_ratio), 1);
+       NXPRT(fprintf(stderr, "comp_ratio %ld source_sz %d spbc %d tpbc %d\n",
+                     last_comp_ratio, source_sz, spbc, tpbc));
+
+       resuming = 1;
+
+finish_state:
+
+       NXPRT(fprintf(stderr, "finish_state:\n"));
+
+       if (is_final) {
+               if (used_out)
+                       goto write_state; /* More data to write out */
+               else if (used_in < 8) {
+                       /* Need at least 8 more bytes containing gzip crc
+                        * and isize.
+                        */
+                       rc = -1;
+                       goto err4;
+               } else {
+                       /* Compare checksums and exit */
+                       int i;
+                       unsigned char tail[8];
+                       uint32_t cksum, isize;
+
+                       for (i = 0; i < 8; i++)
+                               tail[i] = fifo_in[(cur_in + i) % fifo_in_len];
+                       fprintf(stderr, "computed checksum %08x isize %08x\n",
+                               cmdp->cpb.out_crc, (uint32_t) (total_out
+                               % (1ULL<<32)));
+                       cksum = ((uint32_t) tail[0] | (uint32_t) tail[1]<<8
+                                | (uint32_t) tail[2]<<16
+                                | (uint32_t) tail[3]<<24);
+                       isize = ((uint32_t) tail[4] | (uint32_t) tail[5]<<8
+                                | (uint32_t) tail[6]<<16
+                                | (uint32_t) tail[7]<<24);
+                       fprintf(stderr, "stored   checksum %08x isize %08x\n",
+                               cksum, isize);
+
+                       if (cksum == cmdp->cpb.out_crc && isize == (uint32_t)
+                           (total_out % (1ULL<<32))) {
+                               rc = 0; goto ok1;
+                       } else {
+                               rc = -1; goto err4;
+                       }
+               }
+       } else
+               goto read_state;
+
+       return -1;
+
+err1:
+       fprintf(stderr, "error: not a gzip file, expect %x, read %x\n",
+               expect, c);
+       return -1;
+
+err2:
+       fprintf(stderr, "error: the FLG byte is wrong or not being handled\n");
+       return -1;
+
+err3:
+       fprintf(stderr, "error: gzip header\n");
+       return -1;
+
+err4:
+       fprintf(stderr, "error: checksum missing or mismatch\n");
+
+err5:
+ok1:
+       fprintf(stderr, "decomp is complete: fclose\n");
+       fclose(outf);
+
+       return rc;
+}
+
+
+int main(int argc, char **argv)
+{
+       int rc;
+       struct sigaction act;
+       void *handle;
+
+       nx_dbg = 0;
+       nx_gzip_log = NULL;
+       act.sa_handler = 0;
+       act.sa_sigaction = sigsegv_handler;
+       act.sa_flags = SA_SIGINFO;
+       act.sa_restorer = 0;
+       sigemptyset(&act.sa_mask);
+       sigaction(SIGSEGV, &act, NULL);
+
+       handle = nx_function_begin(NX_FUNC_COMP_GZIP, 0);
+       if (!handle) {
+               fprintf(stderr, "Unable to init NX, errno %d\n", errno);
+               exit(-1);
+       }
+
+       rc = decompress_file(argc, argv, handle);
+
+       nx_function_end(handle);
+
+       return rc;
+}
-- 
2.21.0

Reply via email to