cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux
Log | Files | Refs | README | LICENSE | sfeed.txt

gunz_test.c (28085B)


      1// SPDX-License-Identifier: GPL-2.0-or-later
      2
      3/* P9 gunzip sample code for demonstrating the P9 NX hardware
      4 * interface.  Not intended for productive uses or for performance or
      5 * compression ratio measurements.  Note also that /dev/crypto/gzip,
      6 * VAS and skiboot support are required
      7 *
      8 * Copyright 2020 IBM Corp.
      9 *
     10 * Author: Bulent Abali <abali@us.ibm.com>
     11 *
     12 * https://github.com/libnxz/power-gzip for zlib api and other utils
     13 * Definitions of acronyms used here.  See
     14 * P9 NX Gzip Accelerator User's Manual for details:
     15 * https://github.com/libnxz/power-gzip/blob/develop/doc/power_nx_gzip_um.pdf
     16 *
     17 * adler/crc: 32 bit checksums appended to stream tail
     18 * ce:       completion extension
     19 * cpb:      coprocessor parameter block (metadata)
     20 * crb:      coprocessor request block (command)
     21 * csb:      coprocessor status block (status)
     22 * dht:      dynamic huffman table
     23 * dde:      data descriptor element (address, length)
     24 * ddl:      list of ddes
     25 * dh/fh:    dynamic and fixed huffman types
     26 * fc:       coprocessor function code
     27 * histlen:  history/dictionary length
     28 * history:  sliding window of up to 32KB of data
     29 * lzcount:  Deflate LZ symbol counts
     30 * rembytecnt: remaining byte count
     31 * sfbt:     source final block type; last block's type during decomp
     32 * spbc:     source processed byte count
     33 * subc:     source unprocessed bit count
     34 * tebc:     target ending bit count; valid bits in the last byte
     35 * tpbc:     target processed byte count
     36 * vas:      virtual accelerator switch; the user mode interface
     37 */
     38
     39#define _ISOC11_SOURCE	// For aligned_alloc()
     40#define _DEFAULT_SOURCE	// For endian.h
     41
     42#include <stdio.h>
     43#include <stdlib.h>
     44#include <string.h>
     45#include <unistd.h>
     46#include <stdint.h>
     47#include <sys/types.h>
     48#include <sys/stat.h>
     49#include <sys/time.h>
     50#include <sys/fcntl.h>
     51#include <sys/mman.h>
     52#include <endian.h>
     53#include <bits/endian.h>
     54#include <sys/ioctl.h>
     55#include <assert.h>
     56#include <errno.h>
     57#include <signal.h>
     58#include "nxu.h"
     59#include "nx.h"
     60#include "crb.h"
     61
     62int nx_dbg;
     63FILE *nx_gzip_log;
     64
     65#define NX_MIN(X, Y) (((X) < (Y))?(X):(Y))
     66#define NX_MAX(X, Y) (((X) > (Y))?(X):(Y))
     67
     68#define GETINPC(X) fgetc(X)
     69#define FNAME_MAX 1024
     70
     71/* fifo queue management */
     72#define fifo_used_bytes(used) (used)
     73#define fifo_free_bytes(used, len) ((len)-(used))
     74/* amount of free bytes in the first and last parts */
     75#define fifo_free_first_bytes(cur, used, len)  ((((cur)+(used)) <= (len)) \
     76						  ? (len)-((cur)+(used)) : 0)
     77#define fifo_free_last_bytes(cur, used, len)   ((((cur)+(used)) <= (len)) \
     78						  ? (cur) : (len)-(used))
     79/* amount of used bytes in the first and last parts */
     80#define fifo_used_first_bytes(cur, used, len)  ((((cur)+(used)) <= (len)) \
     81						  ? (used) : (len)-(cur))
     82#define fifo_used_last_bytes(cur, used, len)   ((((cur)+(used)) <= (len)) \
     83						  ? 0 : ((used)+(cur))-(len))
     84/* first and last free parts start here */
     85#define fifo_free_first_offset(cur, used)      ((cur)+(used))
     86#define fifo_free_last_offset(cur, used, len)  \
     87					   fifo_used_last_bytes(cur, used, len)
     88/* first and last used parts start here */
     89#define fifo_used_first_offset(cur)            (cur)
     90#define fifo_used_last_offset(cur)             (0)
     91
     92const int fifo_in_len = 1<<24;
     93const int fifo_out_len = 1<<24;
     94const int page_sz = 1<<16;
     95const int line_sz = 1<<7;
     96const int window_max = 1<<15;
     97
     98/*
     99 * Adds an (address, len) pair to the list of ddes (ddl) and updates
    100 * the base dde.  ddl[0] is the only dde in a direct dde which
    101 * contains a single (addr,len) pair.  For more pairs, ddl[0] becomes
    102 * the indirect (base) dde that points to a list of direct ddes.
    103 * See Section 6.4 of the NX-gzip user manual for DDE description.
    104 * Addr=NULL, len=0 clears the ddl[0].  Returns the total number of
    105 * bytes in ddl.  Caller is responsible for allocting the array of
    106 * nx_dde_t *ddl.  If N addresses are required in the scatter-gather
    107 * list, the ddl array must have N+1 entries minimum.
    108 */
    109static inline uint32_t nx_append_dde(struct nx_dde_t *ddl, void *addr,
    110					uint32_t len)
    111{
    112	uint32_t ddecnt;
    113	uint32_t bytes;
    114
    115	if (addr == NULL && len == 0) {
    116		clearp_dde(ddl);
    117		return 0;
    118	}
    119
    120	NXPRT(fprintf(stderr, "%d: %s addr %p len %x\n", __LINE__, addr,
    121			__func__, len));
    122
    123	/* Number of ddes in the dde list ; == 0 when it is a direct dde */
    124	ddecnt = getpnn(ddl, dde_count);
    125	bytes = getp32(ddl, ddebc);
    126
    127	if (ddecnt == 0 && bytes == 0) {
    128		/* First dde is unused; make it a direct dde */
    129		bytes = len;
    130		putp32(ddl, ddebc, bytes);
    131		putp64(ddl, ddead, (uint64_t) addr);
    132	} else if (ddecnt == 0) {
    133		/* Converting direct to indirect dde
    134		 * ddl[0] becomes head dde of ddl
    135		 * copy direct to indirect first.
    136		 */
    137		ddl[1] = ddl[0];
    138
    139		/* Add the new dde next */
    140		clear_dde(ddl[2]);
    141		put32(ddl[2], ddebc, len);
    142		put64(ddl[2], ddead, (uint64_t) addr);
    143
    144		/* Ddl head points to 2 direct ddes */
    145		ddecnt = 2;
    146		putpnn(ddl, dde_count, ddecnt);
    147		bytes = bytes + len;
    148		putp32(ddl, ddebc, bytes);
    149		/* Pointer to the first direct dde */
    150		putp64(ddl, ddead, (uint64_t) &ddl[1]);
    151	} else {
    152		/* Append a dde to an existing indirect ddl */
    153		++ddecnt;
    154		clear_dde(ddl[ddecnt]);
    155		put64(ddl[ddecnt], ddead, (uint64_t) addr);
    156		put32(ddl[ddecnt], ddebc, len);
    157
    158		putpnn(ddl, dde_count, ddecnt);
    159		bytes = bytes + len;
    160		putp32(ddl, ddebc, bytes); /* byte sum of all dde */
    161	}
    162	return bytes;
    163}
    164
    165/*
    166 * Touch specified number of pages represented in number bytes
    167 * beginning from the first buffer in a dde list.
    168 * Do not touch the pages past buf_sz-th byte's page.
    169 *
    170 * Set buf_sz = 0 to touch all pages described by the ddep.
    171 */
    172static int nx_touch_pages_dde(struct nx_dde_t *ddep, long buf_sz, long page_sz,
    173				int wr)
    174{
    175	uint32_t indirect_count;
    176	uint32_t buf_len;
    177	long total;
    178	uint64_t buf_addr;
    179	struct nx_dde_t *dde_list;
    180	int i;
    181
    182	assert(!!ddep);
    183
    184	indirect_count = getpnn(ddep, dde_count);
    185
    186	NXPRT(fprintf(stderr, "%s dde_count %d request len ", __func__,
    187			indirect_count));
    188	NXPRT(fprintf(stderr, "0x%lx\n", buf_sz));
    189
    190	if (indirect_count == 0) {
    191		/* Direct dde */
    192		buf_len = getp32(ddep, ddebc);
    193		buf_addr = getp64(ddep, ddead);
    194
    195		NXPRT(fprintf(stderr, "touch direct ddebc 0x%x ddead %p\n",
    196				buf_len, (void *)buf_addr));
    197
    198		if (buf_sz == 0)
    199			nxu_touch_pages((void *)buf_addr, buf_len, page_sz, wr);
    200		else
    201			nxu_touch_pages((void *)buf_addr, NX_MIN(buf_len,
    202					buf_sz), page_sz, wr);
    203
    204		return ERR_NX_OK;
    205	}
    206
    207	/* Indirect dde */
    208	if (indirect_count > MAX_DDE_COUNT)
    209		return ERR_NX_EXCESSIVE_DDE;
    210
    211	/* First address of the list */
    212	dde_list = (struct nx_dde_t *) getp64(ddep, ddead);
    213
    214	if (buf_sz == 0)
    215		buf_sz = getp32(ddep, ddebc);
    216
    217	total = 0;
    218	for (i = 0; i < indirect_count; i++) {
    219		buf_len = get32(dde_list[i], ddebc);
    220		buf_addr = get64(dde_list[i], ddead);
    221		total += buf_len;
    222
    223		NXPRT(fprintf(stderr, "touch loop len 0x%x ddead %p total ",
    224				buf_len, (void *)buf_addr));
    225		NXPRT(fprintf(stderr, "0x%lx\n", total));
    226
    227		/* Touching fewer pages than encoded in the ddebc */
    228		if (total > buf_sz) {
    229			buf_len = NX_MIN(buf_len, total - buf_sz);
    230			nxu_touch_pages((void *)buf_addr, buf_len, page_sz, wr);
    231			NXPRT(fprintf(stderr, "touch loop break len 0x%x ",
    232				      buf_len));
    233			NXPRT(fprintf(stderr, "ddead %p\n", (void *)buf_addr));
    234			break;
    235		}
    236		nxu_touch_pages((void *)buf_addr, buf_len, page_sz, wr);
    237	}
    238	return ERR_NX_OK;
    239}
    240
    241/*
    242 * Src and dst buffers are supplied in scatter gather lists.
    243 * NX function code and other parameters supplied in cmdp.
    244 */
    245static int nx_submit_job(struct nx_dde_t *src, struct nx_dde_t *dst,
    246			 struct nx_gzip_crb_cpb_t *cmdp, void *handle)
    247{
    248	uint64_t csbaddr;
    249
    250	memset((void *)&cmdp->crb.csb, 0, sizeof(cmdp->crb.csb));
    251
    252	cmdp->crb.source_dde = *src;
    253	cmdp->crb.target_dde = *dst;
    254
    255	/* Status, output byte count in tpbc */
    256	csbaddr = ((uint64_t) &cmdp->crb.csb) & csb_address_mask;
    257	put64(cmdp->crb, csb_address, csbaddr);
    258
    259	/* NX reports input bytes in spbc; cleared */
    260	cmdp->cpb.out_spbc_comp_wrap = 0;
    261	cmdp->cpb.out_spbc_comp_with_count = 0;
    262	cmdp->cpb.out_spbc_decomp = 0;
    263
    264	/* Clear output */
    265	put32(cmdp->cpb, out_crc, INIT_CRC);
    266	put32(cmdp->cpb, out_adler, INIT_ADLER);
    267
    268	/* Submit the crb, the job descriptor, to the accelerator. */
    269	return nxu_submit_job(cmdp, handle);
    270}
    271
    272int decompress_file(int argc, char **argv, void *devhandle)
    273{
    274	FILE *inpf = NULL;
    275	FILE *outf = NULL;
    276
    277	int c, expect, i, cc, rc = 0;
    278	char gzfname[FNAME_MAX];
    279
    280	/* Queuing, file ops, byte counting */
    281	char *fifo_in, *fifo_out;
    282	int used_in, cur_in, used_out, cur_out, read_sz, n;
    283	int first_free, last_free, first_used, last_used;
    284	int first_offset, last_offset;
    285	int write_sz, free_space, source_sz;
    286	int source_sz_estimate, target_sz_estimate;
    287	uint64_t last_comp_ratio = 0; /* 1000 max */
    288	uint64_t total_out = 0;
    289	int is_final, is_eof;
    290
    291	/* nx hardware */
    292	int sfbt, subc, spbc, tpbc, nx_ce, fc, resuming = 0;
    293	int history_len = 0;
    294	struct nx_gzip_crb_cpb_t cmd, *cmdp;
    295	struct nx_dde_t *ddl_in;
    296	struct nx_dde_t dde_in[6] __aligned(128);
    297	struct nx_dde_t *ddl_out;
    298	struct nx_dde_t dde_out[6] __aligned(128);
    299	int pgfault_retries;
    300
    301	/* when using mmap'ed files */
    302	off_t input_file_offset;
    303
    304	if (argc > 2) {
    305		fprintf(stderr, "usage: %s <fname> or stdin\n", argv[0]);
    306		fprintf(stderr, "    writes to stdout or <fname>.nx.gunzip\n");
    307		return -1;
    308	}
    309
    310	if (argc == 1) {
    311		inpf = stdin;
    312		outf = stdout;
    313	} else if (argc == 2) {
    314		char w[1024];
    315		char *wp;
    316
    317		inpf = fopen(argv[1], "r");
    318		if (inpf == NULL) {
    319			perror(argv[1]);
    320			return -1;
    321		}
    322
    323		/* Make a new file name to write to.  Ignoring '.gz' */
    324		wp = (NULL != (wp = strrchr(argv[1], '/'))) ? (wp+1) : argv[1];
    325		strcpy(w, wp);
    326		strcat(w, ".nx.gunzip");
    327
    328		outf = fopen(w, "w");
    329		if (outf == NULL) {
    330			perror(w);
    331			return -1;
    332		}
    333	}
    334
    335	/* Decode the gzip header */
    336	c = GETINPC(inpf); expect = 0x1f; /* ID1 */
    337	if (c != expect)
    338		goto err1;
    339
    340	c = GETINPC(inpf); expect = 0x8b; /* ID2 */
    341	if (c != expect)
    342		goto err1;
    343
    344	c = GETINPC(inpf); expect = 0x08; /* CM */
    345	if (c != expect)
    346		goto err1;
    347
    348	int flg = GETINPC(inpf); /* FLG */
    349
    350	if (flg & 0xE0 || flg & 0x4 || flg == EOF)
    351		goto err2;
    352
    353	fprintf(stderr, "gzHeader FLG %x\n", flg);
    354
    355	/* Read 6 bytes; ignoring the MTIME, XFL, OS fields in this
    356	 * sample code.
    357	 */
    358	for (i = 0; i < 6; i++) {
    359		char tmp[10];
    360
    361		tmp[i] = GETINPC(inpf);
    362		if (tmp[i] == EOF)
    363			goto err3;
    364		fprintf(stderr, "%02x ", tmp[i]);
    365		if (i == 5)
    366			fprintf(stderr, "\n");
    367	}
    368	fprintf(stderr, "gzHeader MTIME, XFL, OS ignored\n");
    369
    370	/* FNAME */
    371	if (flg & 0x8) {
    372		int k = 0;
    373
    374		do {
    375			c = GETINPC(inpf);
    376			if (c == EOF || k >= FNAME_MAX)
    377				goto err3;
    378			gzfname[k++] = c;
    379		} while (c);
    380		fprintf(stderr, "gzHeader FNAME: %s\n", gzfname);
    381	}
    382
    383	/* FHCRC */
    384	if (flg & 0x2) {
    385		c = GETINPC(inpf);
    386		if (c == EOF)
    387			goto err3;
    388		c = GETINPC(inpf);
    389		if (c == EOF)
    390			goto err3;
    391		fprintf(stderr, "gzHeader FHCRC: ignored\n");
    392	}
    393
    394	used_in = cur_in = used_out = cur_out = 0;
    395	is_final = is_eof = 0;
    396
    397	/* Allocate one page larger to prevent page faults due to NX
    398	 * overfetching.
    399	 * Either do this (char*)(uintptr_t)aligned_alloc or use
    400	 * -std=c11 flag to make the int-to-pointer warning go away.
    401	 */
    402	assert((fifo_in  = (char *)(uintptr_t)aligned_alloc(line_sz,
    403				   fifo_in_len + page_sz)) != NULL);
    404	assert((fifo_out = (char *)(uintptr_t)aligned_alloc(line_sz,
    405				   fifo_out_len + page_sz + line_sz)) != NULL);
    406	/* Leave unused space due to history rounding rules */
    407	fifo_out = fifo_out + line_sz;
    408	nxu_touch_pages(fifo_out, fifo_out_len, page_sz, 1);
    409
    410	ddl_in  = &dde_in[0];
    411	ddl_out = &dde_out[0];
    412	cmdp = &cmd;
    413	memset(&cmdp->crb, 0, sizeof(cmdp->crb));
    414
    415read_state:
    416
    417	/* Read from .gz file */
    418
    419	NXPRT(fprintf(stderr, "read_state:\n"));
    420
    421	if (is_eof != 0)
    422		goto write_state;
    423
    424	/* We read in to fifo_in in two steps: first: read in to from
    425	 * cur_in to the end of the buffer.  last: if free space wrapped
    426	 * around, read from fifo_in offset 0 to offset cur_in.
    427	 */
    428
    429	/* Reset fifo head to reduce unnecessary wrap arounds */
    430	cur_in = (used_in == 0) ? 0 : cur_in;
    431
    432	/* Free space total is reduced by a gap */
    433	free_space = NX_MAX(0, fifo_free_bytes(used_in, fifo_in_len)
    434			    - line_sz);
    435
    436	/* Free space may wrap around as first and last */
    437	first_free = fifo_free_first_bytes(cur_in, used_in, fifo_in_len);
    438	last_free  = fifo_free_last_bytes(cur_in, used_in, fifo_in_len);
    439
    440	/* Start offsets of the free memory */
    441	first_offset = fifo_free_first_offset(cur_in, used_in);
    442	last_offset  = fifo_free_last_offset(cur_in, used_in, fifo_in_len);
    443
    444	/* Reduce read_sz because of the line_sz gap */
    445	read_sz = NX_MIN(free_space, first_free);
    446	n = 0;
    447	if (read_sz > 0) {
    448		/* Read in to offset cur_in + used_in */
    449		n = fread(fifo_in + first_offset, 1, read_sz, inpf);
    450		used_in = used_in + n;
    451		free_space = free_space - n;
    452		assert(n <= read_sz);
    453		if (n != read_sz) {
    454			/* Either EOF or error; exit the read loop */
    455			is_eof = 1;
    456			goto write_state;
    457		}
    458	}
    459
    460	/* If free space wrapped around */
    461	if (last_free > 0) {
    462		/* Reduce read_sz because of the line_sz gap */
    463		read_sz = NX_MIN(free_space, last_free);
    464		n = 0;
    465		if (read_sz > 0) {
    466			n = fread(fifo_in + last_offset, 1, read_sz, inpf);
    467			used_in = used_in + n;       /* Increase used space */
    468			free_space = free_space - n; /* Decrease free space */
    469			assert(n <= read_sz);
    470			if (n != read_sz) {
    471				/* Either EOF or error; exit the read loop */
    472				is_eof = 1;
    473				goto write_state;
    474			}
    475		}
    476	}
    477
    478	/* At this point we have used_in bytes in fifo_in with the
    479	 * data head starting at cur_in and possibly wrapping around.
    480	 */
    481
    482write_state:
    483
    484	/* Write decompressed data to output file */
    485
    486	NXPRT(fprintf(stderr, "write_state:\n"));
    487
    488	if (used_out == 0)
    489		goto decomp_state;
    490
    491	/* If fifo_out has data waiting, write it out to the file to
    492	 * make free target space for the accelerator used bytes in
    493	 * the first and last parts of fifo_out.
    494	 */
    495
    496	first_used = fifo_used_first_bytes(cur_out, used_out, fifo_out_len);
    497	last_used  = fifo_used_last_bytes(cur_out, used_out, fifo_out_len);
    498
    499	write_sz = first_used;
    500
    501	n = 0;
    502	if (write_sz > 0) {
    503		n = fwrite(fifo_out + cur_out, 1, write_sz, outf);
    504		used_out = used_out - n;
    505		/* Move head of the fifo */
    506		cur_out = (cur_out + n) % fifo_out_len;
    507		assert(n <= write_sz);
    508		if (n != write_sz) {
    509			fprintf(stderr, "error: write\n");
    510			rc = -1;
    511			goto err5;
    512		}
    513	}
    514
    515	if (last_used > 0) { /* If more data available in the last part */
    516		write_sz = last_used; /* Keep it here for later */
    517		n = 0;
    518		if (write_sz > 0) {
    519			n = fwrite(fifo_out, 1, write_sz, outf);
    520			used_out = used_out - n;
    521			cur_out = (cur_out + n) % fifo_out_len;
    522			assert(n <= write_sz);
    523			if (n != write_sz) {
    524				fprintf(stderr, "error: write\n");
    525				rc = -1;
    526				goto err5;
    527			}
    528		}
    529	}
    530
    531decomp_state:
    532
    533	/* NX decompresses input data */
    534
    535	NXPRT(fprintf(stderr, "decomp_state:\n"));
    536
    537	if (is_final)
    538		goto finish_state;
    539
    540	/* Address/len lists */
    541	clearp_dde(ddl_in);
    542	clearp_dde(ddl_out);
    543
    544	/* FC, CRC, HistLen, Table 6-6 */
    545	if (resuming) {
    546		/* Resuming a partially decompressed input.
    547		 * The key to resume is supplying the 32KB
    548		 * dictionary (history) to NX, which is basically
    549		 * the last 32KB of output produced.
    550		 */
    551		fc = GZIP_FC_DECOMPRESS_RESUME;
    552
    553		cmdp->cpb.in_crc   = cmdp->cpb.out_crc;
    554		cmdp->cpb.in_adler = cmdp->cpb.out_adler;
    555
    556		/* Round up the history size to quadword.  Section 2.10 */
    557		history_len = (history_len + 15) / 16;
    558		putnn(cmdp->cpb, in_histlen, history_len);
    559		history_len = history_len * 16; /* bytes */
    560
    561		if (history_len > 0) {
    562			/* Chain in the history buffer to the DDE list */
    563			if (cur_out >= history_len) {
    564				nx_append_dde(ddl_in, fifo_out
    565					      + (cur_out - history_len),
    566					      history_len);
    567			} else {
    568				nx_append_dde(ddl_in, fifo_out
    569					      + ((fifo_out_len + cur_out)
    570					      - history_len),
    571					      history_len - cur_out);
    572				/* Up to 32KB history wraps around fifo_out */
    573				nx_append_dde(ddl_in, fifo_out, cur_out);
    574			}
    575
    576		}
    577	} else {
    578		/* First decompress job */
    579		fc = GZIP_FC_DECOMPRESS;
    580
    581		history_len = 0;
    582		/* Writing 0 clears out subc as well */
    583		cmdp->cpb.in_histlen = 0;
    584		total_out = 0;
    585
    586		put32(cmdp->cpb, in_crc, INIT_CRC);
    587		put32(cmdp->cpb, in_adler, INIT_ADLER);
    588		put32(cmdp->cpb, out_crc, INIT_CRC);
    589		put32(cmdp->cpb, out_adler, INIT_ADLER);
    590
    591		/* Assuming 10% compression ratio initially; use the
    592		 * most recently measured compression ratio as a
    593		 * heuristic to estimate the input and output
    594		 * sizes.  If we give too much input, the target buffer
    595		 * overflows and NX cycles are wasted, and then we
    596		 * must retry with smaller input size.  1000 is 100%.
    597		 */
    598		last_comp_ratio = 100UL;
    599	}
    600	cmdp->crb.gzip_fc = 0;
    601	putnn(cmdp->crb, gzip_fc, fc);
    602
    603	/*
    604	 * NX source buffers
    605	 */
    606	first_used = fifo_used_first_bytes(cur_in, used_in, fifo_in_len);
    607	last_used = fifo_used_last_bytes(cur_in, used_in, fifo_in_len);
    608
    609	if (first_used > 0)
    610		nx_append_dde(ddl_in, fifo_in + cur_in, first_used);
    611
    612	if (last_used > 0)
    613		nx_append_dde(ddl_in, fifo_in, last_used);
    614
    615	/*
    616	 * NX target buffers
    617	 */
    618	first_free = fifo_free_first_bytes(cur_out, used_out, fifo_out_len);
    619	last_free = fifo_free_last_bytes(cur_out, used_out, fifo_out_len);
    620
    621	/* Reduce output free space amount not to overwrite the history */
    622	int target_max = NX_MAX(0, fifo_free_bytes(used_out, fifo_out_len)
    623				- (1<<16));
    624
    625	NXPRT(fprintf(stderr, "target_max %d (0x%x)\n", target_max,
    626		      target_max));
    627
    628	first_free = NX_MIN(target_max, first_free);
    629	if (first_free > 0) {
    630		first_offset = fifo_free_first_offset(cur_out, used_out);
    631		nx_append_dde(ddl_out, fifo_out + first_offset, first_free);
    632	}
    633
    634	if (last_free > 0) {
    635		last_free = NX_MIN(target_max - first_free, last_free);
    636		if (last_free > 0) {
    637			last_offset = fifo_free_last_offset(cur_out, used_out,
    638							    fifo_out_len);
    639			nx_append_dde(ddl_out, fifo_out + last_offset,
    640				      last_free);
    641		}
    642	}
    643
    644	/* Target buffer size is used to limit the source data size
    645	 * based on previous measurements of compression ratio.
    646	 */
    647
    648	/* source_sz includes history */
    649	source_sz = getp32(ddl_in, ddebc);
    650	assert(source_sz > history_len);
    651	source_sz = source_sz - history_len;
    652
    653	/* Estimating how much source is needed to 3/4 fill a
    654	 * target_max size target buffer.  If we overshoot, then NX
    655	 * must repeat the job with smaller input and we waste
    656	 * bandwidth.  If we undershoot then we use more NX calls than
    657	 * necessary.
    658	 */
    659
    660	source_sz_estimate = ((uint64_t)target_max * last_comp_ratio * 3UL)
    661				/ 4000;
    662
    663	if (source_sz_estimate < source_sz) {
    664		/* Target might be small, therefore limiting the
    665		 * source data.
    666		 */
    667		source_sz = source_sz_estimate;
    668		target_sz_estimate = target_max;
    669	} else {
    670		/* Source file might be small, therefore limiting target
    671		 * touch pages to a smaller value to save processor cycles.
    672		 */
    673		target_sz_estimate = ((uint64_t)source_sz * 1000UL)
    674					/ (last_comp_ratio + 1);
    675		target_sz_estimate = NX_MIN(2 * target_sz_estimate,
    676					    target_max);
    677	}
    678
    679	source_sz = source_sz + history_len;
    680
    681	/* Some NX condition codes require submitting the NX job again.
    682	 * Kernel doesn't handle NX page faults. Expects user code to
    683	 * touch pages.
    684	 */
    685	pgfault_retries = NX_MAX_FAULTS;
    686
    687restart_nx:
    688
    689	putp32(ddl_in, ddebc, source_sz);
    690
    691	/* Fault in pages */
    692	nxu_touch_pages(cmdp, sizeof(struct nx_gzip_crb_cpb_t), page_sz, 1);
    693	nx_touch_pages_dde(ddl_in, 0, page_sz, 0);
    694	nx_touch_pages_dde(ddl_out, target_sz_estimate, page_sz, 1);
    695
    696	/* Send job to NX */
    697	cc = nx_submit_job(ddl_in, ddl_out, cmdp, devhandle);
    698
    699	switch (cc) {
    700
    701	case ERR_NX_AT_FAULT:
    702
    703		/* We touched the pages ahead of time.  In the most common case
    704		 * we shouldn't be here.  But may be some pages were paged out.
    705		 * Kernel should have placed the faulting address to fsaddr.
    706		 */
    707		NXPRT(fprintf(stderr, "ERR_NX_AT_FAULT %p\n",
    708			      (void *)cmdp->crb.csb.fsaddr));
    709
    710		if (pgfault_retries == NX_MAX_FAULTS) {
    711			/* Try once with exact number of pages */
    712			--pgfault_retries;
    713			goto restart_nx;
    714		} else if (pgfault_retries > 0) {
    715			/* If still faulting try fewer input pages
    716			 * assuming memory outage
    717			 */
    718			if (source_sz > page_sz)
    719				source_sz = NX_MAX(source_sz / 2, page_sz);
    720			--pgfault_retries;
    721			goto restart_nx;
    722		} else {
    723			fprintf(stderr, "cannot make progress; too many ");
    724			fprintf(stderr, "page fault retries cc= %d\n", cc);
    725			rc = -1;
    726			goto err5;
    727		}
    728
    729	case ERR_NX_DATA_LENGTH:
    730
    731		NXPRT(fprintf(stderr, "ERR_NX_DATA_LENGTH; "));
    732		NXPRT(fprintf(stderr, "stream may have trailing data\n"));
    733
    734		/* Not an error in the most common case; it just says
    735		 * there is trailing data that we must examine.
    736		 *
    737		 * CC=3 CE(1)=0 CE(0)=1 indicates partial completion
    738		 * Fig.6-7 and Table 6-8.
    739		 */
    740		nx_ce = get_csb_ce_ms3b(cmdp->crb.csb);
    741
    742		if (!csb_ce_termination(nx_ce) &&
    743		    csb_ce_partial_completion(nx_ce)) {
    744			/* Check CPB for more information
    745			 * spbc and tpbc are valid
    746			 */
    747			sfbt = getnn(cmdp->cpb, out_sfbt); /* Table 6-4 */
    748			subc = getnn(cmdp->cpb, out_subc); /* Table 6-4 */
    749			spbc = get32(cmdp->cpb, out_spbc_decomp);
    750			tpbc = get32(cmdp->crb.csb, tpbc);
    751			assert(target_max >= tpbc);
    752
    753			goto ok_cc3; /* not an error */
    754		} else {
    755			/* History length error when CE(1)=1 CE(0)=0. */
    756			rc = -1;
    757			fprintf(stderr, "history length error cc= %d\n", cc);
    758			goto err5;
    759		}
    760
    761	case ERR_NX_TARGET_SPACE:
    762
    763		/* Target buffer not large enough; retry smaller input
    764		 * data; give at least 1 byte.  SPBC/TPBC are not valid.
    765		 */
    766		assert(source_sz > history_len);
    767		source_sz = ((source_sz - history_len + 2) / 2) + history_len;
    768		NXPRT(fprintf(stderr, "ERR_NX_TARGET_SPACE; retry with "));
    769		NXPRT(fprintf(stderr, "smaller input data src %d hist %d\n",
    770			      source_sz, history_len));
    771		goto restart_nx;
    772
    773	case ERR_NX_OK:
    774
    775		/* This should not happen for gzip formatted data;
    776		 * we need trailing crc and isize
    777		 */
    778		fprintf(stderr, "ERR_NX_OK\n");
    779		spbc = get32(cmdp->cpb, out_spbc_decomp);
    780		tpbc = get32(cmdp->crb.csb, tpbc);
    781		assert(target_max >= tpbc);
    782		assert(spbc >= history_len);
    783		source_sz = spbc - history_len;
    784		goto offsets_state;
    785
    786	default:
    787		fprintf(stderr, "error: cc= %d\n", cc);
    788		rc = -1;
    789		goto err5;
    790	}
    791
    792ok_cc3:
    793
    794	NXPRT(fprintf(stderr, "cc3: sfbt: %x\n", sfbt));
    795
    796	assert(spbc > history_len);
    797	source_sz = spbc - history_len;
    798
    799	/* Table 6-4: Source Final Block Type (SFBT) describes the
    800	 * last processed deflate block and clues the software how to
    801	 * resume the next job.  SUBC indicates how many input bits NX
    802	 * consumed but did not process.  SPBC indicates how many
    803	 * bytes of source were given to the accelerator including
    804	 * history bytes.
    805	 */
    806
    807	switch (sfbt) {
    808		int dhtlen;
    809
    810	case 0x0: /* Deflate final EOB received */
    811
    812		/* Calculating the checksum start position. */
    813
    814		source_sz = source_sz - subc / 8;
    815		is_final = 1;
    816		break;
    817
    818		/* Resume decompression cases are below. Basically
    819		 * indicates where NX has suspended and how to resume
    820		 * the input stream.
    821		 */
    822
    823	case 0x8: /* Within a literal block; use rembytecount */
    824	case 0x9: /* Within a literal block; use rembytecount; bfinal=1 */
    825
    826		/* Supply the partially processed source byte again */
    827		source_sz = source_sz - ((subc + 7) / 8);
    828
    829		/* SUBC LS 3bits: number of bits in the first source byte need
    830		 * to be processed.
    831		 * 000 means all 8 bits;  Table 6-3
    832		 * Clear subc, histlen, sfbt, rembytecnt, dhtlen
    833		 */
    834		cmdp->cpb.in_subc = 0;
    835		cmdp->cpb.in_sfbt = 0;
    836		putnn(cmdp->cpb, in_subc, subc % 8);
    837		putnn(cmdp->cpb, in_sfbt, sfbt);
    838		putnn(cmdp->cpb, in_rembytecnt, getnn(cmdp->cpb,
    839						      out_rembytecnt));
    840		break;
    841
    842	case 0xA: /* Within a FH block; */
    843	case 0xB: /* Within a FH block; bfinal=1 */
    844
    845		source_sz = source_sz - ((subc + 7) / 8);
    846
    847		/* Clear subc, histlen, sfbt, rembytecnt, dhtlen */
    848		cmdp->cpb.in_subc = 0;
    849		cmdp->cpb.in_sfbt = 0;
    850		putnn(cmdp->cpb, in_subc, subc % 8);
    851		putnn(cmdp->cpb, in_sfbt, sfbt);
    852		break;
    853
    854	case 0xC: /* Within a DH block; */
    855	case 0xD: /* Within a DH block; bfinal=1 */
    856
    857		source_sz = source_sz - ((subc + 7) / 8);
    858
    859		/* Clear subc, histlen, sfbt, rembytecnt, dhtlen */
    860		cmdp->cpb.in_subc = 0;
    861		cmdp->cpb.in_sfbt = 0;
    862		putnn(cmdp->cpb, in_subc, subc % 8);
    863		putnn(cmdp->cpb, in_sfbt, sfbt);
    864
    865		dhtlen = getnn(cmdp->cpb, out_dhtlen);
    866		putnn(cmdp->cpb, in_dhtlen, dhtlen);
    867		assert(dhtlen >= 42);
    868
    869		/* Round up to a qword */
    870		dhtlen = (dhtlen + 127) / 128;
    871
    872		while (dhtlen > 0) { /* Copy dht from cpb.out to cpb.in */
    873			--dhtlen;
    874			cmdp->cpb.in_dht[dhtlen] = cmdp->cpb.out_dht[dhtlen];
    875		}
    876		break;
    877
    878	case 0xE: /* Within a block header; bfinal=0; */
    879		     /* Also given if source data exactly ends (SUBC=0) with
    880		      * EOB code with BFINAL=0.  Means the next byte will
    881		      * contain a block header.
    882		      */
    883	case 0xF: /* within a block header with BFINAL=1. */
    884
    885		source_sz = source_sz - ((subc + 7) / 8);
    886
    887		/* Clear subc, histlen, sfbt, rembytecnt, dhtlen */
    888		cmdp->cpb.in_subc = 0;
    889		cmdp->cpb.in_sfbt = 0;
    890		putnn(cmdp->cpb, in_subc, subc % 8);
    891		putnn(cmdp->cpb, in_sfbt, sfbt);
    892
    893		/* Engine did not process any data */
    894		if (is_eof && (source_sz == 0))
    895			is_final = 1;
    896	}
    897
    898offsets_state:
    899
    900	/* Adjust the source and target buffer offsets and lengths  */
    901
    902	NXPRT(fprintf(stderr, "offsets_state:\n"));
    903
    904	/* Delete input data from fifo_in */
    905	used_in = used_in - source_sz;
    906	cur_in = (cur_in + source_sz) % fifo_in_len;
    907	input_file_offset = input_file_offset + source_sz;
    908
    909	/* Add output data to fifo_out */
    910	used_out = used_out + tpbc;
    911
    912	assert(used_out <= fifo_out_len);
    913
    914	total_out = total_out + tpbc;
    915
    916	/* Deflate history is 32KB max.  No need to supply more
    917	 * than 32KB on a resume.
    918	 */
    919	history_len = (total_out > window_max) ? window_max : total_out;
    920
    921	/* To estimate expected expansion in the next NX job; 500 means 50%.
    922	 * Deflate best case is around 1 to 1000.
    923	 */
    924	last_comp_ratio = (1000UL * ((uint64_t)source_sz + 1))
    925			  / ((uint64_t)tpbc + 1);
    926	last_comp_ratio = NX_MAX(NX_MIN(1000UL, last_comp_ratio), 1);
    927	NXPRT(fprintf(stderr, "comp_ratio %ld source_sz %d spbc %d tpbc %d\n",
    928		      last_comp_ratio, source_sz, spbc, tpbc));
    929
    930	resuming = 1;
    931
    932finish_state:
    933
    934	NXPRT(fprintf(stderr, "finish_state:\n"));
    935
    936	if (is_final) {
    937		if (used_out)
    938			goto write_state; /* More data to write out */
    939		else if (used_in < 8) {
    940			/* Need at least 8 more bytes containing gzip crc
    941			 * and isize.
    942			 */
    943			rc = -1;
    944			goto err4;
    945		} else {
    946			/* Compare checksums and exit */
    947			int i;
    948			unsigned char tail[8];
    949			uint32_t cksum, isize;
    950
    951			for (i = 0; i < 8; i++)
    952				tail[i] = fifo_in[(cur_in + i) % fifo_in_len];
    953			fprintf(stderr, "computed checksum %08x isize %08x\n",
    954				cmdp->cpb.out_crc, (uint32_t) (total_out
    955				% (1ULL<<32)));
    956			cksum = ((uint32_t) tail[0] | (uint32_t) tail[1]<<8
    957				 | (uint32_t) tail[2]<<16
    958				 | (uint32_t) tail[3]<<24);
    959			isize = ((uint32_t) tail[4] | (uint32_t) tail[5]<<8
    960				 | (uint32_t) tail[6]<<16
    961				 | (uint32_t) tail[7]<<24);
    962			fprintf(stderr, "stored   checksum %08x isize %08x\n",
    963				cksum, isize);
    964
    965			if (cksum == cmdp->cpb.out_crc && isize == (uint32_t)
    966			    (total_out % (1ULL<<32))) {
    967				rc = 0;	goto ok1;
    968			} else {
    969				rc = -1; goto err4;
    970			}
    971		}
    972	} else
    973		goto read_state;
    974
    975	return -1;
    976
    977err1:
    978	fprintf(stderr, "error: not a gzip file, expect %x, read %x\n",
    979		expect, c);
    980	return -1;
    981
    982err2:
    983	fprintf(stderr, "error: the FLG byte is wrong or not being handled\n");
    984	return -1;
    985
    986err3:
    987	fprintf(stderr, "error: gzip header\n");
    988	return -1;
    989
    990err4:
    991	fprintf(stderr, "error: checksum missing or mismatch\n");
    992
    993err5:
    994ok1:
    995	fprintf(stderr, "decomp is complete: fclose\n");
    996	fclose(outf);
    997
    998	return rc;
    999}
   1000
   1001
   1002int main(int argc, char **argv)
   1003{
   1004	int rc;
   1005	struct sigaction act;
   1006	void *handle;
   1007
   1008	nx_dbg = 0;
   1009	nx_gzip_log = NULL;
   1010	act.sa_handler = 0;
   1011	act.sa_sigaction = nxu_sigsegv_handler;
   1012	act.sa_flags = SA_SIGINFO;
   1013	act.sa_restorer = 0;
   1014	sigemptyset(&act.sa_mask);
   1015	sigaction(SIGSEGV, &act, NULL);
   1016
   1017	handle = nx_function_begin(NX_FUNC_COMP_GZIP, 0);
   1018	if (!handle) {
   1019		fprintf(stderr, "Unable to init NX, errno %d\n", errno);
   1020		exit(-1);
   1021	}
   1022
   1023	rc = decompress_file(argc, argv, handle);
   1024
   1025	nx_function_end(handle);
   1026
   1027	return rc;
   1028}