bch.c - cachepc-linux - Fork of AMDESE/linux with modifications for CachePC side-channel attack

	cachepc-linux Fork of AMDESE/linux with modifications for CachePC side-channel attack
	git clone https://git.sinitax.com/sinitax/cachepc-linux
	Log \| Files \| Refs \| README \| LICENSE \| sfeed.txt
bch.c (39266B)
      1/*
      2 * Generic binary BCH encoding/decoding library
      3 *
      4 * This program is free software; you can redistribute it and/or modify it
      5 * under the terms of the GNU General Public License version 2 as published by
      6 * the Free Software Foundation.
      7 *
      8 * This program is distributed in the hope that it will be useful, but WITHOUT
      9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
     10 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
     11 * more details.
     12 *
     13 * You should have received a copy of the GNU General Public License along with
     14 * this program; if not, write to the Free Software Foundation, Inc., 51
     15 * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
     16 *
     17 * Copyright © 2011 Parrot S.A.
     18 *
     19 * Author: Ivan Djelic <ivan.djelic@parrot.com>
     20 *
     21 * Description:
     22 *
     23 * This library provides runtime configurable encoding/decoding of binary
     24 * Bose-Chaudhuri-Hocquenghem (BCH) codes.
     25 *
     26 * Call bch_init to get a pointer to a newly allocated bch_control structure for
     27 * the given m (Galois field order), t (error correction capability) and
     28 * (optional) primitive polynomial parameters.
     29 *
     30 * Call bch_encode to compute and store ecc parity bytes to a given buffer.
     31 * Call bch_decode to detect and locate errors in received data.
     32 *
     33 * On systems supporting hw BCH features, intermediate results may be provided
     34 * to bch_decode in order to skip certain steps. See bch_decode() documentation
     35 * for details.
     36 *
     37 * Option CONFIG_BCH_CONST_PARAMS can be used to force fixed values of
     38 * parameters m and t; thus allowing extra compiler optimizations and providing
     39 * better (up to 2x) encoding performance. Using this option makes sense when
     40 * (m,t) are fixed and known in advance, e.g. when using BCH error correction
     41 * on a particular NAND flash device.
     42 *
     43 * Algorithmic details:
     44 *
     45 * Encoding is performed by processing 32 input bits in parallel, using 4
     46 * remainder lookup tables.
     47 *
     48 * The final stage of decoding involves the following internal steps:
     49 * a. Syndrome computation
     50 * b. Error locator polynomial computation using Berlekamp-Massey algorithm
     51 * c. Error locator root finding (by far the most expensive step)
     52 *
     53 * In this implementation, step c is not performed using the usual Chien search.
     54 * Instead, an alternative approach described in [1] is used. It consists in
     55 * factoring the error locator polynomial using the Berlekamp Trace algorithm
     56 * (BTA) down to a certain degree (4), after which ad hoc low-degree polynomial
     57 * solving techniques [2] are used. The resulting algorithm, called BTZ, yields
     58 * much better performance than Chien search for usual (m,t) values (typically
     59 * m >= 13, t < 32, see [1]).
     60 *
     61 * [1] B. Biswas, V. Herbert. Efficient root finding of polynomials over fields
     62 * of characteristic 2, in: Western European Workshop on Research in Cryptology
     63 * - WEWoRC 2009, Graz, Austria, LNCS, Springer, July 2009, to appear.
     64 * [2] [Zin96] V.A. Zinoviev. On the solution of equations of degree 10 over
     65 * finite fields GF(2^q). In Rapport de recherche INRIA no 2829, 1996.
     66 */
     67
     68#include <linux/kernel.h>
     69#include <linux/errno.h>
     70#include <linux/init.h>
     71#include <linux/module.h>
     72#include <linux/slab.h>
     73#include <linux/bitops.h>
     74#include <asm/byteorder.h>
     75#include <linux/bch.h>
     76
     77#if defined(CONFIG_BCH_CONST_PARAMS)
     78#define GF_M(_p)               (CONFIG_BCH_CONST_M)
     79#define GF_T(_p)               (CONFIG_BCH_CONST_T)
     80#define GF_N(_p)               ((1 << (CONFIG_BCH_CONST_M))-1)
     81#define BCH_MAX_M              (CONFIG_BCH_CONST_M)
     82#define BCH_MAX_T	       (CONFIG_BCH_CONST_T)
     83#else
     84#define GF_M(_p)               ((_p)->m)
     85#define GF_T(_p)               ((_p)->t)
     86#define GF_N(_p)               ((_p)->n)
     87#define BCH_MAX_M              15 /* 2KB */
     88#define BCH_MAX_T              64 /* 64 bit correction */
     89#endif
     90
     91#define BCH_ECC_WORDS(_p)      DIV_ROUND_UP(GF_M(_p)*GF_T(_p), 32)
     92#define BCH_ECC_BYTES(_p)      DIV_ROUND_UP(GF_M(_p)*GF_T(_p), 8)
     93
     94#define BCH_ECC_MAX_WORDS      DIV_ROUND_UP(BCH_MAX_M * BCH_MAX_T, 32)
     95
     96#ifndef dbg
     97#define dbg(_fmt, args...)     do {} while (0)
     98#endif
     99
    100/*
    101 * represent a polynomial over GF(2^m)
    102 */
    103struct gf_poly {
    104	unsigned int deg;    /* polynomial degree */
    105	unsigned int c[];   /* polynomial terms */
    106};
    107
    108/* given its degree, compute a polynomial size in bytes */
    109#define GF_POLY_SZ(_d) (sizeof(struct gf_poly)+((_d)+1)*sizeof(unsigned int))
    110
    111/* polynomial of degree 1 */
    112struct gf_poly_deg1 {
    113	struct gf_poly poly;
    114	unsigned int   c[2];
    115};
    116
    117static u8 swap_bits_table[] = {
    118	0x00, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0,
    119	0x10, 0x90, 0x50, 0xd0, 0x30, 0xb0, 0x70, 0xf0,
    120	0x08, 0x88, 0x48, 0xc8, 0x28, 0xa8, 0x68, 0xe8,
    121	0x18, 0x98, 0x58, 0xd8, 0x38, 0xb8, 0x78, 0xf8,
    122	0x04, 0x84, 0x44, 0xc4, 0x24, 0xa4, 0x64, 0xe4,
    123	0x14, 0x94, 0x54, 0xd4, 0x34, 0xb4, 0x74, 0xf4,
    124	0x0c, 0x8c, 0x4c, 0xcc, 0x2c, 0xac, 0x6c, 0xec,
    125	0x1c, 0x9c, 0x5c, 0xdc, 0x3c, 0xbc, 0x7c, 0xfc,
    126	0x02, 0x82, 0x42, 0xc2, 0x22, 0xa2, 0x62, 0xe2,
    127	0x12, 0x92, 0x52, 0xd2, 0x32, 0xb2, 0x72, 0xf2,
    128	0x0a, 0x8a, 0x4a, 0xca, 0x2a, 0xaa, 0x6a, 0xea,
    129	0x1a, 0x9a, 0x5a, 0xda, 0x3a, 0xba, 0x7a, 0xfa,
    130	0x06, 0x86, 0x46, 0xc6, 0x26, 0xa6, 0x66, 0xe6,
    131	0x16, 0x96, 0x56, 0xd6, 0x36, 0xb6, 0x76, 0xf6,
    132	0x0e, 0x8e, 0x4e, 0xce, 0x2e, 0xae, 0x6e, 0xee,
    133	0x1e, 0x9e, 0x5e, 0xde, 0x3e, 0xbe, 0x7e, 0xfe,
    134	0x01, 0x81, 0x41, 0xc1, 0x21, 0xa1, 0x61, 0xe1,
    135	0x11, 0x91, 0x51, 0xd1, 0x31, 0xb1, 0x71, 0xf1,
    136	0x09, 0x89, 0x49, 0xc9, 0x29, 0xa9, 0x69, 0xe9,
    137	0x19, 0x99, 0x59, 0xd9, 0x39, 0xb9, 0x79, 0xf9,
    138	0x05, 0x85, 0x45, 0xc5, 0x25, 0xa5, 0x65, 0xe5,
    139	0x15, 0x95, 0x55, 0xd5, 0x35, 0xb5, 0x75, 0xf5,
    140	0x0d, 0x8d, 0x4d, 0xcd, 0x2d, 0xad, 0x6d, 0xed,
    141	0x1d, 0x9d, 0x5d, 0xdd, 0x3d, 0xbd, 0x7d, 0xfd,
    142	0x03, 0x83, 0x43, 0xc3, 0x23, 0xa3, 0x63, 0xe3,
    143	0x13, 0x93, 0x53, 0xd3, 0x33, 0xb3, 0x73, 0xf3,
    144	0x0b, 0x8b, 0x4b, 0xcb, 0x2b, 0xab, 0x6b, 0xeb,
    145	0x1b, 0x9b, 0x5b, 0xdb, 0x3b, 0xbb, 0x7b, 0xfb,
    146	0x07, 0x87, 0x47, 0xc7, 0x27, 0xa7, 0x67, 0xe7,
    147	0x17, 0x97, 0x57, 0xd7, 0x37, 0xb7, 0x77, 0xf7,
    148	0x0f, 0x8f, 0x4f, 0xcf, 0x2f, 0xaf, 0x6f, 0xef,
    149	0x1f, 0x9f, 0x5f, 0xdf, 0x3f, 0xbf, 0x7f, 0xff,
    150};
    151
    152static u8 swap_bits(struct bch_control *bch, u8 in)
    153{
    154	if (!bch->swap_bits)
    155		return in;
    156
    157	return swap_bits_table[in];
    158}
    159
    160/*
    161 * same as bch_encode(), but process input data one byte at a time
    162 */
    163static void bch_encode_unaligned(struct bch_control *bch,
    164				 const unsigned char *data, unsigned int len,
    165				 uint32_t *ecc)
    166{
    167	int i;
    168	const uint32_t *p;
    169	const int l = BCH_ECC_WORDS(bch)-1;
    170
    171	while (len--) {
    172		u8 tmp = swap_bits(bch, *data++);
    173
    174		p = bch->mod8_tab + (l+1)*(((ecc[0] >> 24)^(tmp)) & 0xff);
    175
    176		for (i = 0; i < l; i++)
    177			ecc[i] = ((ecc[i] << 8)|(ecc[i+1] >> 24))^(*p++);
    178
    179		ecc[l] = (ecc[l] << 8)^(*p);
    180	}
    181}
    182
    183/*
    184 * convert ecc bytes to aligned, zero-padded 32-bit ecc words
    185 */
    186static void load_ecc8(struct bch_control *bch, uint32_t *dst,
    187		      const uint8_t *src)
    188{
    189	uint8_t pad[4] = {0, 0, 0, 0};
    190	unsigned int i, nwords = BCH_ECC_WORDS(bch)-1;
    191
    192	for (i = 0; i < nwords; i++, src += 4)
    193		dst[i] = ((u32)swap_bits(bch, src[0]) << 24) |
    194			((u32)swap_bits(bch, src[1]) << 16) |
    195			((u32)swap_bits(bch, src[2]) << 8) |
    196			swap_bits(bch, src[3]);
    197
    198	memcpy(pad, src, BCH_ECC_BYTES(bch)-4*nwords);
    199	dst[nwords] = ((u32)swap_bits(bch, pad[0]) << 24) |
    200		((u32)swap_bits(bch, pad[1]) << 16) |
    201		((u32)swap_bits(bch, pad[2]) << 8) |
    202		swap_bits(bch, pad[3]);
    203}
    204
    205/*
    206 * convert 32-bit ecc words to ecc bytes
    207 */
    208static void store_ecc8(struct bch_control *bch, uint8_t *dst,
    209		       const uint32_t *src)
    210{
    211	uint8_t pad[4];
    212	unsigned int i, nwords = BCH_ECC_WORDS(bch)-1;
    213
    214	for (i = 0; i < nwords; i++) {
    215		*dst++ = swap_bits(bch, src[i] >> 24);
    216		*dst++ = swap_bits(bch, src[i] >> 16);
    217		*dst++ = swap_bits(bch, src[i] >> 8);
    218		*dst++ = swap_bits(bch, src[i]);
    219	}
    220	pad[0] = swap_bits(bch, src[nwords] >> 24);
    221	pad[1] = swap_bits(bch, src[nwords] >> 16);
    222	pad[2] = swap_bits(bch, src[nwords] >> 8);
    223	pad[3] = swap_bits(bch, src[nwords]);
    224	memcpy(dst, pad, BCH_ECC_BYTES(bch)-4*nwords);
    225}
    226
    227/**
    228 * bch_encode - calculate BCH ecc parity of data
    229 * @bch:   BCH control structure
    230 * @data:  data to encode
    231 * @len:   data length in bytes
    232 * @ecc:   ecc parity data, must be initialized by caller
    233 *
    234 * The @ecc parity array is used both as input and output parameter, in order to
    235 * allow incremental computations. It should be of the size indicated by member
    236 * @ecc_bytes of @bch, and should be initialized to 0 before the first call.
    237 *
    238 * The exact number of computed ecc parity bits is given by member @ecc_bits of
    239 * @bch; it may be less than m*t for large values of t.
    240 */
    241void bch_encode(struct bch_control *bch, const uint8_t *data,
    242		unsigned int len, uint8_t *ecc)
    243{
    244	const unsigned int l = BCH_ECC_WORDS(bch)-1;
    245	unsigned int i, mlen;
    246	unsigned long m;
    247	uint32_t w, r[BCH_ECC_MAX_WORDS];
    248	const size_t r_bytes = BCH_ECC_WORDS(bch) * sizeof(*r);
    249	const uint32_t * const tab0 = bch->mod8_tab;
    250	const uint32_t * const tab1 = tab0 + 256*(l+1);
    251	const uint32_t * const tab2 = tab1 + 256*(l+1);
    252	const uint32_t * const tab3 = tab2 + 256*(l+1);
    253	const uint32_t *pdata, *p0, *p1, *p2, *p3;
    254
    255	if (WARN_ON(r_bytes > sizeof(r)))
    256		return;
    257
    258	if (ecc) {
    259		/* load ecc parity bytes into internal 32-bit buffer */
    260		load_ecc8(bch, bch->ecc_buf, ecc);
    261	} else {
    262		memset(bch->ecc_buf, 0, r_bytes);
    263	}
    264
    265	/* process first unaligned data bytes */
    266	m = ((unsigned long)data) & 3;
    267	if (m) {
    268		mlen = (len < (4-m)) ? len : 4-m;
    269		bch_encode_unaligned(bch, data, mlen, bch->ecc_buf);
    270		data += mlen;
    271		len  -= mlen;
    272	}
    273
    274	/* process 32-bit aligned data words */
    275	pdata = (uint32_t *)data;
    276	mlen  = len/4;
    277	data += 4*mlen;
    278	len  -= 4*mlen;
    279	memcpy(r, bch->ecc_buf, r_bytes);
    280
    281	/*
    282	 * split each 32-bit word into 4 polynomials of weight 8 as follows:
    283	 *
    284	 * 31 ...24  23 ...16  15 ... 8  7 ... 0
    285	 * xxxxxxxx  yyyyyyyy  zzzzzzzz  tttttttt
    286	 *                               tttttttt  mod g = r0 (precomputed)
    287	 *                     zzzzzzzz  00000000  mod g = r1 (precomputed)
    288	 *           yyyyyyyy  00000000  00000000  mod g = r2 (precomputed)
    289	 * xxxxxxxx  00000000  00000000  00000000  mod g = r3 (precomputed)
    290	 * xxxxxxxx  yyyyyyyy  zzzzzzzz  tttttttt  mod g = r0^r1^r2^r3
    291	 */
    292	while (mlen--) {
    293		/* input data is read in big-endian format */
    294		w = cpu_to_be32(*pdata++);
    295		if (bch->swap_bits)
    296			w = (u32)swap_bits(bch, w) |
    297			    ((u32)swap_bits(bch, w >> 8) << 8) |
    298			    ((u32)swap_bits(bch, w >> 16) << 16) |
    299			    ((u32)swap_bits(bch, w >> 24) << 24);
    300		w ^= r[0];
    301		p0 = tab0 + (l+1)*((w >>  0) & 0xff);
    302		p1 = tab1 + (l+1)*((w >>  8) & 0xff);
    303		p2 = tab2 + (l+1)*((w >> 16) & 0xff);
    304		p3 = tab3 + (l+1)*((w >> 24) & 0xff);
    305
    306		for (i = 0; i < l; i++)
    307			r[i] = r[i+1]^p0[i]^p1[i]^p2[i]^p3[i];
    308
    309		r[l] = p0[l]^p1[l]^p2[l]^p3[l];
    310	}
    311	memcpy(bch->ecc_buf, r, r_bytes);
    312
    313	/* process last unaligned bytes */
    314	if (len)
    315		bch_encode_unaligned(bch, data, len, bch->ecc_buf);
    316
    317	/* store ecc parity bytes into original parity buffer */
    318	if (ecc)
    319		store_ecc8(bch, ecc, bch->ecc_buf);
    320}
    321EXPORT_SYMBOL_GPL(bch_encode);
    322
    323static inline int modulo(struct bch_control *bch, unsigned int v)
    324{
    325	const unsigned int n = GF_N(bch);
    326	while (v >= n) {
    327		v -= n;
    328		v = (v & n) + (v >> GF_M(bch));
    329	}
    330	return v;
    331}
    332
    333/*
    334 * shorter and faster modulo function, only works when v < 2N.
    335 */
    336static inline int mod_s(struct bch_control *bch, unsigned int v)
    337{
    338	const unsigned int n = GF_N(bch);
    339	return (v < n) ? v : v-n;
    340}
    341
    342static inline int deg(unsigned int poly)
    343{
    344	/* polynomial degree is the most-significant bit index */
    345	return fls(poly)-1;
    346}
    347
    348static inline int parity(unsigned int x)
    349{
    350	/*
    351	 * public domain code snippet, lifted from
    352	 * http://www-graphics.stanford.edu/~seander/bithacks.html
    353	 */
    354	x ^= x >> 1;
    355	x ^= x >> 2;
    356	x = (x & 0x11111111U) * 0x11111111U;
    357	return (x >> 28) & 1;
    358}
    359
    360/* Galois field basic operations: multiply, divide, inverse, etc. */
    361
    362static inline unsigned int gf_mul(struct bch_control *bch, unsigned int a,
    363				  unsigned int b)
    364{
    365	return (a && b) ? bch->a_pow_tab[mod_s(bch, bch->a_log_tab[a]+
    366					       bch->a_log_tab[b])] : 0;
    367}
    368
    369static inline unsigned int gf_sqr(struct bch_control *bch, unsigned int a)
    370{
    371	return a ? bch->a_pow_tab[mod_s(bch, 2*bch->a_log_tab[a])] : 0;
    372}
    373
    374static inline unsigned int gf_div(struct bch_control *bch, unsigned int a,
    375				  unsigned int b)
    376{
    377	return a ? bch->a_pow_tab[mod_s(bch, bch->a_log_tab[a]+
    378					GF_N(bch)-bch->a_log_tab[b])] : 0;
    379}
    380
    381static inline unsigned int gf_inv(struct bch_control *bch, unsigned int a)
    382{
    383	return bch->a_pow_tab[GF_N(bch)-bch->a_log_tab[a]];
    384}
    385
    386static inline unsigned int a_pow(struct bch_control *bch, int i)
    387{
    388	return bch->a_pow_tab[modulo(bch, i)];
    389}
    390
    391static inline int a_log(struct bch_control *bch, unsigned int x)
    392{
    393	return bch->a_log_tab[x];
    394}
    395
    396static inline int a_ilog(struct bch_control *bch, unsigned int x)
    397{
    398	return mod_s(bch, GF_N(bch)-bch->a_log_tab[x]);
    399}
    400
    401/*
    402 * compute 2t syndromes of ecc polynomial, i.e. ecc(a^j) for j=1..2t
    403 */
    404static void compute_syndromes(struct bch_control *bch, uint32_t *ecc,
    405			      unsigned int *syn)
    406{
    407	int i, j, s;
    408	unsigned int m;
    409	uint32_t poly;
    410	const int t = GF_T(bch);
    411
    412	s = bch->ecc_bits;
    413
    414	/* make sure extra bits in last ecc word are cleared */
    415	m = ((unsigned int)s) & 31;
    416	if (m)
    417		ecc[s/32] &= ~((1u << (32-m))-1);
    418	memset(syn, 0, 2*t*sizeof(*syn));
    419
    420	/* compute v(a^j) for j=1 .. 2t-1 */
    421	do {
    422		poly = *ecc++;
    423		s -= 32;
    424		while (poly) {
    425			i = deg(poly);
    426			for (j = 0; j < 2*t; j += 2)
    427				syn[j] ^= a_pow(bch, (j+1)*(i+s));
    428
    429			poly ^= (1 << i);
    430		}
    431	} while (s > 0);
    432
    433	/* v(a^(2j)) = v(a^j)^2 */
    434	for (j = 0; j < t; j++)
    435		syn[2*j+1] = gf_sqr(bch, syn[j]);
    436}
    437
    438static void gf_poly_copy(struct gf_poly *dst, struct gf_poly *src)
    439{
    440	memcpy(dst, src, GF_POLY_SZ(src->deg));
    441}
    442
    443static int compute_error_locator_polynomial(struct bch_control *bch,
    444					    const unsigned int *syn)
    445{
    446	const unsigned int t = GF_T(bch);
    447	const unsigned int n = GF_N(bch);
    448	unsigned int i, j, tmp, l, pd = 1, d = syn[0];
    449	struct gf_poly *elp = bch->elp;
    450	struct gf_poly *pelp = bch->poly_2t[0];
    451	struct gf_poly *elp_copy = bch->poly_2t[1];
    452	int k, pp = -1;
    453
    454	memset(pelp, 0, GF_POLY_SZ(2*t));
    455	memset(elp, 0, GF_POLY_SZ(2*t));
    456
    457	pelp->deg = 0;
    458	pelp->c[0] = 1;
    459	elp->deg = 0;
    460	elp->c[0] = 1;
    461
    462	/* use simplified binary Berlekamp-Massey algorithm */
    463	for (i = 0; (i < t) && (elp->deg <= t); i++) {
    464		if (d) {
    465			k = 2*i-pp;
    466			gf_poly_copy(elp_copy, elp);
    467			/* e[i+1](X) = e[i](X)+di*dp^-1*X^2(i-p)*e[p](X) */
    468			tmp = a_log(bch, d)+n-a_log(bch, pd);
    469			for (j = 0; j <= pelp->deg; j++) {
    470				if (pelp->c[j]) {
    471					l = a_log(bch, pelp->c[j]);
    472					elp->c[j+k] ^= a_pow(bch, tmp+l);
    473				}
    474			}
    475			/* compute l[i+1] = max(l[i]->c[l[p]+2*(i-p]) */
    476			tmp = pelp->deg+k;
    477			if (tmp > elp->deg) {
    478				elp->deg = tmp;
    479				gf_poly_copy(pelp, elp_copy);
    480				pd = d;
    481				pp = 2*i;
    482			}
    483		}
    484		/* di+1 = S(2i+3)+elp[i+1].1*S(2i+2)+...+elp[i+1].lS(2i+3-l) */
    485		if (i < t-1) {
    486			d = syn[2*i+2];
    487			for (j = 1; j <= elp->deg; j++)
    488				d ^= gf_mul(bch, elp->c[j], syn[2*i+2-j]);
    489		}
    490	}
    491	dbg("elp=%s\n", gf_poly_str(elp));
    492	return (elp->deg > t) ? -1 : (int)elp->deg;
    493}
    494
    495/*
    496 * solve a m x m linear system in GF(2) with an expected number of solutions,
    497 * and return the number of found solutions
    498 */
    499static int solve_linear_system(struct bch_control *bch, unsigned int *rows,
    500			       unsigned int *sol, int nsol)
    501{
    502	const int m = GF_M(bch);
    503	unsigned int tmp, mask;
    504	int rem, c, r, p, k, param[BCH_MAX_M];
    505
    506	k = 0;
    507	mask = 1 << m;
    508
    509	/* Gaussian elimination */
    510	for (c = 0; c < m; c++) {
    511		rem = 0;
    512		p = c-k;
    513		/* find suitable row for elimination */
    514		for (r = p; r < m; r++) {
    515			if (rows[r] & mask) {
    516				if (r != p) {
    517					tmp = rows[r];
    518					rows[r] = rows[p];
    519					rows[p] = tmp;
    520				}
    521				rem = r+1;
    522				break;
    523			}
    524		}
    525		if (rem) {
    526			/* perform elimination on remaining rows */
    527			tmp = rows[p];
    528			for (r = rem; r < m; r++) {
    529				if (rows[r] & mask)
    530					rows[r] ^= tmp;
    531			}
    532		} else {
    533			/* elimination not needed, store defective row index */
    534			param[k++] = c;
    535		}
    536		mask >>= 1;
    537	}
    538	/* rewrite system, inserting fake parameter rows */
    539	if (k > 0) {
    540		p = k;
    541		for (r = m-1; r >= 0; r--) {
    542			if ((r > m-1-k) && rows[r])
    543				/* system has no solution */
    544				return 0;
    545
    546			rows[r] = (p && (r == param[p-1])) ?
    547				p--, 1u << (m-r) : rows[r-p];
    548		}
    549	}
    550
    551	if (nsol != (1 << k))
    552		/* unexpected number of solutions */
    553		return 0;
    554
    555	for (p = 0; p < nsol; p++) {
    556		/* set parameters for p-th solution */
    557		for (c = 0; c < k; c++)
    558			rows[param[c]] = (rows[param[c]] & ~1)|((p >> c) & 1);
    559
    560		/* compute unique solution */
    561		tmp = 0;
    562		for (r = m-1; r >= 0; r--) {
    563			mask = rows[r] & (tmp|1);
    564			tmp |= parity(mask) << (m-r);
    565		}
    566		sol[p] = tmp >> 1;
    567	}
    568	return nsol;
    569}
    570
    571/*
    572 * this function builds and solves a linear system for finding roots of a degree
    573 * 4 affine monic polynomial X^4+aX^2+bX+c over GF(2^m).
    574 */
    575static int find_affine4_roots(struct bch_control *bch, unsigned int a,
    576			      unsigned int b, unsigned int c,
    577			      unsigned int *roots)
    578{
    579	int i, j, k;
    580	const int m = GF_M(bch);
    581	unsigned int mask = 0xff, t, rows[16] = {0,};
    582
    583	j = a_log(bch, b);
    584	k = a_log(bch, a);
    585	rows[0] = c;
    586
    587	/* build linear system to solve X^4+aX^2+bX+c = 0 */
    588	for (i = 0; i < m; i++) {
    589		rows[i+1] = bch->a_pow_tab[4*i]^
    590			(a ? bch->a_pow_tab[mod_s(bch, k)] : 0)^
    591			(b ? bch->a_pow_tab[mod_s(bch, j)] : 0);
    592		j++;
    593		k += 2;
    594	}
    595	/*
    596	 * transpose 16x16 matrix before passing it to linear solver
    597	 * warning: this code assumes m < 16
    598	 */
    599	for (j = 8; j != 0; j >>= 1, mask ^= (mask << j)) {
    600		for (k = 0; k < 16; k = (k+j+1) & ~j) {
    601			t = ((rows[k] >> j)^rows[k+j]) & mask;
    602			rows[k] ^= (t << j);
    603			rows[k+j] ^= t;
    604		}
    605	}
    606	return solve_linear_system(bch, rows, roots, 4);
    607}
    608
    609/*
    610 * compute root r of a degree 1 polynomial over GF(2^m) (returned as log(1/r))
    611 */
    612static int find_poly_deg1_roots(struct bch_control *bch, struct gf_poly *poly,
    613				unsigned int *roots)
    614{
    615	int n = 0;
    616
    617	if (poly->c[0])
    618		/* poly[X] = bX+c with c!=0, root=c/b */
    619		roots[n++] = mod_s(bch, GF_N(bch)-bch->a_log_tab[poly->c[0]]+
    620				   bch->a_log_tab[poly->c[1]]);
    621	return n;
    622}
    623
    624/*
    625 * compute roots of a degree 2 polynomial over GF(2^m)
    626 */
    627static int find_poly_deg2_roots(struct bch_control *bch, struct gf_poly *poly,
    628				unsigned int *roots)
    629{
    630	int n = 0, i, l0, l1, l2;
    631	unsigned int u, v, r;
    632
    633	if (poly->c[0] && poly->c[1]) {
    634
    635		l0 = bch->a_log_tab[poly->c[0]];
    636		l1 = bch->a_log_tab[poly->c[1]];
    637		l2 = bch->a_log_tab[poly->c[2]];
    638
    639		/* using z=a/bX, transform aX^2+bX+c into z^2+z+u (u=ac/b^2) */
    640		u = a_pow(bch, l0+l2+2*(GF_N(bch)-l1));
    641		/*
    642		 * let u = sum(li.a^i) i=0..m-1; then compute r = sum(li.xi):
    643		 * r^2+r = sum(li.(xi^2+xi)) = sum(li.(a^i+Tr(a^i).a^k)) =
    644		 * u + sum(li.Tr(a^i).a^k) = u+a^k.Tr(sum(li.a^i)) = u+a^k.Tr(u)
    645		 * i.e. r and r+1 are roots iff Tr(u)=0
    646		 */
    647		r = 0;
    648		v = u;
    649		while (v) {
    650			i = deg(v);
    651			r ^= bch->xi_tab[i];
    652			v ^= (1 << i);
    653		}
    654		/* verify root */
    655		if ((gf_sqr(bch, r)^r) == u) {
    656			/* reverse z=a/bX transformation and compute log(1/r) */
    657			roots[n++] = modulo(bch, 2*GF_N(bch)-l1-
    658					    bch->a_log_tab[r]+l2);
    659			roots[n++] = modulo(bch, 2*GF_N(bch)-l1-
    660					    bch->a_log_tab[r^1]+l2);
    661		}
    662	}
    663	return n;
    664}
    665
    666/*
    667 * compute roots of a degree 3 polynomial over GF(2^m)
    668 */
    669static int find_poly_deg3_roots(struct bch_control *bch, struct gf_poly *poly,
    670				unsigned int *roots)
    671{
    672	int i, n = 0;
    673	unsigned int a, b, c, a2, b2, c2, e3, tmp[4];
    674
    675	if (poly->c[0]) {
    676		/* transform polynomial into monic X^3 + a2X^2 + b2X + c2 */
    677		e3 = poly->c[3];
    678		c2 = gf_div(bch, poly->c[0], e3);
    679		b2 = gf_div(bch, poly->c[1], e3);
    680		a2 = gf_div(bch, poly->c[2], e3);
    681
    682		/* (X+a2)(X^3+a2X^2+b2X+c2) = X^4+aX^2+bX+c (affine) */
    683		c = gf_mul(bch, a2, c2);           /* c = a2c2      */
    684		b = gf_mul(bch, a2, b2)^c2;        /* b = a2b2 + c2 */
    685		a = gf_sqr(bch, a2)^b2;            /* a = a2^2 + b2 */
    686
    687		/* find the 4 roots of this affine polynomial */
    688		if (find_affine4_roots(bch, a, b, c, tmp) == 4) {
    689			/* remove a2 from final list of roots */
    690			for (i = 0; i < 4; i++) {
    691				if (tmp[i] != a2)
    692					roots[n++] = a_ilog(bch, tmp[i]);
    693			}
    694		}
    695	}
    696	return n;
    697}
    698
    699/*
    700 * compute roots of a degree 4 polynomial over GF(2^m)
    701 */
    702static int find_poly_deg4_roots(struct bch_control *bch, struct gf_poly *poly,
    703				unsigned int *roots)
    704{
    705	int i, l, n = 0;
    706	unsigned int a, b, c, d, e = 0, f, a2, b2, c2, e4;
    707
    708	if (poly->c[0] == 0)
    709		return 0;
    710
    711	/* transform polynomial into monic X^4 + aX^3 + bX^2 + cX + d */
    712	e4 = poly->c[4];
    713	d = gf_div(bch, poly->c[0], e4);
    714	c = gf_div(bch, poly->c[1], e4);
    715	b = gf_div(bch, poly->c[2], e4);
    716	a = gf_div(bch, poly->c[3], e4);
    717
    718	/* use Y=1/X transformation to get an affine polynomial */
    719	if (a) {
    720		/* first, eliminate cX by using z=X+e with ae^2+c=0 */
    721		if (c) {
    722			/* compute e such that e^2 = c/a */
    723			f = gf_div(bch, c, a);
    724			l = a_log(bch, f);
    725			l += (l & 1) ? GF_N(bch) : 0;
    726			e = a_pow(bch, l/2);
    727			/*
    728			 * use transformation z=X+e:
    729			 * z^4+e^4 + a(z^3+ez^2+e^2z+e^3) + b(z^2+e^2) +cz+ce+d
    730			 * z^4 + az^3 + (ae+b)z^2 + (ae^2+c)z+e^4+be^2+ae^3+ce+d
    731			 * z^4 + az^3 + (ae+b)z^2 + e^4+be^2+d
    732			 * z^4 + az^3 +     b'z^2 + d'
    733			 */
    734			d = a_pow(bch, 2*l)^gf_mul(bch, b, f)^d;
    735			b = gf_mul(bch, a, e)^b;
    736		}
    737		/* now, use Y=1/X to get Y^4 + b/dY^2 + a/dY + 1/d */
    738		if (d == 0)
    739			/* assume all roots have multiplicity 1 */
    740			return 0;
    741
    742		c2 = gf_inv(bch, d);
    743		b2 = gf_div(bch, a, d);
    744		a2 = gf_div(bch, b, d);
    745	} else {
    746		/* polynomial is already affine */
    747		c2 = d;
    748		b2 = c;
    749		a2 = b;
    750	}
    751	/* find the 4 roots of this affine polynomial */
    752	if (find_affine4_roots(bch, a2, b2, c2, roots) == 4) {
    753		for (i = 0; i < 4; i++) {
    754			/* post-process roots (reverse transformations) */
    755			f = a ? gf_inv(bch, roots[i]) : roots[i];
    756			roots[i] = a_ilog(bch, f^e);
    757		}
    758		n = 4;
    759	}
    760	return n;
    761}
    762
    763/*
    764 * build monic, log-based representation of a polynomial
    765 */
    766static void gf_poly_logrep(struct bch_control *bch,
    767			   const struct gf_poly *a, int *rep)
    768{
    769	int i, d = a->deg, l = GF_N(bch)-a_log(bch, a->c[a->deg]);
    770
    771	/* represent 0 values with -1; warning, rep[d] is not set to 1 */
    772	for (i = 0; i < d; i++)
    773		rep[i] = a->c[i] ? mod_s(bch, a_log(bch, a->c[i])+l) : -1;
    774}
    775
    776/*
    777 * compute polynomial Euclidean division remainder in GF(2^m)[X]
    778 */
    779static void gf_poly_mod(struct bch_control *bch, struct gf_poly *a,
    780			const struct gf_poly *b, int *rep)
    781{
    782	int la, p, m;
    783	unsigned int i, j, *c = a->c;
    784	const unsigned int d = b->deg;
    785
    786	if (a->deg < d)
    787		return;
    788
    789	/* reuse or compute log representation of denominator */
    790	if (!rep) {
    791		rep = bch->cache;
    792		gf_poly_logrep(bch, b, rep);
    793	}
    794
    795	for (j = a->deg; j >= d; j--) {
    796		if (c[j]) {
    797			la = a_log(bch, c[j]);
    798			p = j-d;
    799			for (i = 0; i < d; i++, p++) {
    800				m = rep[i];
    801				if (m >= 0)
    802					c[p] ^= bch->a_pow_tab[mod_s(bch,
    803								     m+la)];
    804			}
    805		}
    806	}
    807	a->deg = d-1;
    808	while (!c[a->deg] && a->deg)
    809		a->deg--;
    810}
    811
    812/*
    813 * compute polynomial Euclidean division quotient in GF(2^m)[X]
    814 */
    815static void gf_poly_div(struct bch_control *bch, struct gf_poly *a,
    816			const struct gf_poly *b, struct gf_poly *q)
    817{
    818	if (a->deg >= b->deg) {
    819		q->deg = a->deg-b->deg;
    820		/* compute a mod b (modifies a) */
    821		gf_poly_mod(bch, a, b, NULL);
    822		/* quotient is stored in upper part of polynomial a */
    823		memcpy(q->c, &a->c[b->deg], (1+q->deg)*sizeof(unsigned int));
    824	} else {
    825		q->deg = 0;
    826		q->c[0] = 0;
    827	}
    828}
    829
    830/*
    831 * compute polynomial GCD (Greatest Common Divisor) in GF(2^m)[X]
    832 */
    833static struct gf_poly *gf_poly_gcd(struct bch_control *bch, struct gf_poly *a,
    834				   struct gf_poly *b)
    835{
    836	struct gf_poly *tmp;
    837
    838	dbg("gcd(%s,%s)=", gf_poly_str(a), gf_poly_str(b));
    839
    840	if (a->deg < b->deg) {
    841		tmp = b;
    842		b = a;
    843		a = tmp;
    844	}
    845
    846	while (b->deg > 0) {
    847		gf_poly_mod(bch, a, b, NULL);
    848		tmp = b;
    849		b = a;
    850		a = tmp;
    851	}
    852
    853	dbg("%s\n", gf_poly_str(a));
    854
    855	return a;
    856}
    857
    858/*
    859 * Given a polynomial f and an integer k, compute Tr(a^kX) mod f
    860 * This is used in Berlekamp Trace algorithm for splitting polynomials
    861 */
    862static void compute_trace_bk_mod(struct bch_control *bch, int k,
    863				 const struct gf_poly *f, struct gf_poly *z,
    864				 struct gf_poly *out)
    865{
    866	const int m = GF_M(bch);
    867	int i, j;
    868
    869	/* z contains z^2j mod f */
    870	z->deg = 1;
    871	z->c[0] = 0;
    872	z->c[1] = bch->a_pow_tab[k];
    873
    874	out->deg = 0;
    875	memset(out, 0, GF_POLY_SZ(f->deg));
    876
    877	/* compute f log representation only once */
    878	gf_poly_logrep(bch, f, bch->cache);
    879
    880	for (i = 0; i < m; i++) {
    881		/* add a^(k*2^i)(z^(2^i) mod f) and compute (z^(2^i) mod f)^2 */
    882		for (j = z->deg; j >= 0; j--) {
    883			out->c[j] ^= z->c[j];
    884			z->c[2*j] = gf_sqr(bch, z->c[j]);
    885			z->c[2*j+1] = 0;
    886		}
    887		if (z->deg > out->deg)
    888			out->deg = z->deg;
    889
    890		if (i < m-1) {
    891			z->deg *= 2;
    892			/* z^(2(i+1)) mod f = (z^(2^i) mod f)^2 mod f */
    893			gf_poly_mod(bch, z, f, bch->cache);
    894		}
    895	}
    896	while (!out->c[out->deg] && out->deg)
    897		out->deg--;
    898
    899	dbg("Tr(a^%d.X) mod f = %s\n", k, gf_poly_str(out));
    900}
    901
    902/*
    903 * factor a polynomial using Berlekamp Trace algorithm (BTA)
    904 */
    905static void factor_polynomial(struct bch_control *bch, int k, struct gf_poly *f,
    906			      struct gf_poly **g, struct gf_poly **h)
    907{
    908	struct gf_poly *f2 = bch->poly_2t[0];
    909	struct gf_poly *q  = bch->poly_2t[1];
    910	struct gf_poly *tk = bch->poly_2t[2];
    911	struct gf_poly *z  = bch->poly_2t[3];
    912	struct gf_poly *gcd;
    913
    914	dbg("factoring %s...\n", gf_poly_str(f));
    915
    916	*g = f;
    917	*h = NULL;
    918
    919	/* tk = Tr(a^k.X) mod f */
    920	compute_trace_bk_mod(bch, k, f, z, tk);
    921
    922	if (tk->deg > 0) {
    923		/* compute g = gcd(f, tk) (destructive operation) */
    924		gf_poly_copy(f2, f);
    925		gcd = gf_poly_gcd(bch, f2, tk);
    926		if (gcd->deg < f->deg) {
    927			/* compute h=f/gcd(f,tk); this will modify f and q */
    928			gf_poly_div(bch, f, gcd, q);
    929			/* store g and h in-place (clobbering f) */
    930			*h = &((struct gf_poly_deg1 *)f)[gcd->deg].poly;
    931			gf_poly_copy(*g, gcd);
    932			gf_poly_copy(*h, q);
    933		}
    934	}
    935}
    936
    937/*
    938 * find roots of a polynomial, using BTZ algorithm; see the beginning of this
    939 * file for details
    940 */
    941static int find_poly_roots(struct bch_control *bch, unsigned int k,
    942			   struct gf_poly *poly, unsigned int *roots)
    943{
    944	int cnt;
    945	struct gf_poly *f1, *f2;
    946
    947	switch (poly->deg) {
    948		/* handle low degree polynomials with ad hoc techniques */
    949	case 1:
    950		cnt = find_poly_deg1_roots(bch, poly, roots);
    951		break;
    952	case 2:
    953		cnt = find_poly_deg2_roots(bch, poly, roots);
    954		break;
    955	case 3:
    956		cnt = find_poly_deg3_roots(bch, poly, roots);
    957		break;
    958	case 4:
    959		cnt = find_poly_deg4_roots(bch, poly, roots);
    960		break;
    961	default:
    962		/* factor polynomial using Berlekamp Trace Algorithm (BTA) */
    963		cnt = 0;
    964		if (poly->deg && (k <= GF_M(bch))) {
    965			factor_polynomial(bch, k, poly, &f1, &f2);
    966			if (f1)
    967				cnt += find_poly_roots(bch, k+1, f1, roots);
    968			if (f2)
    969				cnt += find_poly_roots(bch, k+1, f2, roots+cnt);
    970		}
    971		break;
    972	}
    973	return cnt;
    974}
    975
    976#if defined(USE_CHIEN_SEARCH)
    977/*
    978 * exhaustive root search (Chien) implementation - not used, included only for
    979 * reference/comparison tests
    980 */
    981static int chien_search(struct bch_control *bch, unsigned int len,
    982			struct gf_poly *p, unsigned int *roots)
    983{
    984	int m;
    985	unsigned int i, j, syn, syn0, count = 0;
    986	const unsigned int k = 8*len+bch->ecc_bits;
    987
    988	/* use a log-based representation of polynomial */
    989	gf_poly_logrep(bch, p, bch->cache);
    990	bch->cache[p->deg] = 0;
    991	syn0 = gf_div(bch, p->c[0], p->c[p->deg]);
    992
    993	for (i = GF_N(bch)-k+1; i <= GF_N(bch); i++) {
    994		/* compute elp(a^i) */
    995		for (j = 1, syn = syn0; j <= p->deg; j++) {
    996			m = bch->cache[j];
    997			if (m >= 0)
    998				syn ^= a_pow(bch, m+j*i);
    999		}
   1000		if (syn == 0) {
   1001			roots[count++] = GF_N(bch)-i;
   1002			if (count == p->deg)
   1003				break;
   1004		}
   1005	}
   1006	return (count == p->deg) ? count : 0;
   1007}
   1008#define find_poly_roots(_p, _k, _elp, _loc) chien_search(_p, len, _elp, _loc)
   1009#endif /* USE_CHIEN_SEARCH */
   1010
   1011/**
   1012 * bch_decode - decode received codeword and find bit error locations
   1013 * @bch:      BCH control structure
   1014 * @data:     received data, ignored if @calc_ecc is provided
   1015 * @len:      data length in bytes, must always be provided
   1016 * @recv_ecc: received ecc, if NULL then assume it was XORed in @calc_ecc
   1017 * @calc_ecc: calculated ecc, if NULL then calc_ecc is computed from @data
   1018 * @syn:      hw computed syndrome data (if NULL, syndrome is calculated)
   1019 * @errloc:   output array of error locations
   1020 *
   1021 * Returns:
   1022 *  The number of errors found, or -EBADMSG if decoding failed, or -EINVAL if
   1023 *  invalid parameters were provided
   1024 *
   1025 * Depending on the available hw BCH support and the need to compute @calc_ecc
   1026 * separately (using bch_encode()), this function should be called with one of
   1027 * the following parameter configurations -
   1028 *
   1029 * by providing @data and @recv_ecc only:
   1030 *   bch_decode(@bch, @data, @len, @recv_ecc, NULL, NULL, @errloc)
   1031 *
   1032 * by providing @recv_ecc and @calc_ecc:
   1033 *   bch_decode(@bch, NULL, @len, @recv_ecc, @calc_ecc, NULL, @errloc)
   1034 *
   1035 * by providing ecc = recv_ecc XOR calc_ecc:
   1036 *   bch_decode(@bch, NULL, @len, NULL, ecc, NULL, @errloc)
   1037 *
   1038 * by providing syndrome results @syn:
   1039 *   bch_decode(@bch, NULL, @len, NULL, NULL, @syn, @errloc)
   1040 *
   1041 * Once bch_decode() has successfully returned with a positive value, error
   1042 * locations returned in array @errloc should be interpreted as follows -
   1043 *
   1044 * if (errloc[n] >= 8*len), then n-th error is located in ecc (no need for
   1045 * data correction)
   1046 *
   1047 * if (errloc[n] < 8*len), then n-th error is located in data and can be
   1048 * corrected with statement data[errloc[n]/8] ^= 1 << (errloc[n] % 8);
   1049 *
   1050 * Note that this function does not perform any data correction by itself, it
   1051 * merely indicates error locations.
   1052 */
   1053int bch_decode(struct bch_control *bch, const uint8_t *data, unsigned int len,
   1054	       const uint8_t *recv_ecc, const uint8_t *calc_ecc,
   1055	       const unsigned int *syn, unsigned int *errloc)
   1056{
   1057	const unsigned int ecc_words = BCH_ECC_WORDS(bch);
   1058	unsigned int nbits;
   1059	int i, err, nroots;
   1060	uint32_t sum;
   1061
   1062	/* sanity check: make sure data length can be handled */
   1063	if (8*len > (bch->n-bch->ecc_bits))
   1064		return -EINVAL;
   1065
   1066	/* if caller does not provide syndromes, compute them */
   1067	if (!syn) {
   1068		if (!calc_ecc) {
   1069			/* compute received data ecc into an internal buffer */
   1070			if (!data || !recv_ecc)
   1071				return -EINVAL;
   1072			bch_encode(bch, data, len, NULL);
   1073		} else {
   1074			/* load provided calculated ecc */
   1075			load_ecc8(bch, bch->ecc_buf, calc_ecc);
   1076		}
   1077		/* load received ecc or assume it was XORed in calc_ecc */
   1078		if (recv_ecc) {
   1079			load_ecc8(bch, bch->ecc_buf2, recv_ecc);
   1080			/* XOR received and calculated ecc */
   1081			for (i = 0, sum = 0; i < (int)ecc_words; i++) {
   1082				bch->ecc_buf[i] ^= bch->ecc_buf2[i];
   1083				sum |= bch->ecc_buf[i];
   1084			}
   1085			if (!sum)
   1086				/* no error found */
   1087				return 0;
   1088		}
   1089		compute_syndromes(bch, bch->ecc_buf, bch->syn);
   1090		syn = bch->syn;
   1091	}
   1092
   1093	err = compute_error_locator_polynomial(bch, syn);
   1094	if (err > 0) {
   1095		nroots = find_poly_roots(bch, 1, bch->elp, errloc);
   1096		if (err != nroots)
   1097			err = -1;
   1098	}
   1099	if (err > 0) {
   1100		/* post-process raw error locations for easier correction */
   1101		nbits = (len*8)+bch->ecc_bits;
   1102		for (i = 0; i < err; i++) {
   1103			if (errloc[i] >= nbits) {
   1104				err = -1;
   1105				break;
   1106			}
   1107			errloc[i] = nbits-1-errloc[i];
   1108			if (!bch->swap_bits)
   1109				errloc[i] = (errloc[i] & ~7) |
   1110					    (7-(errloc[i] & 7));
   1111		}
   1112	}
   1113	return (err >= 0) ? err : -EBADMSG;
   1114}
   1115EXPORT_SYMBOL_GPL(bch_decode);
   1116
   1117/*
   1118 * generate Galois field lookup tables
   1119 */
   1120static int build_gf_tables(struct bch_control *bch, unsigned int poly)
   1121{
   1122	unsigned int i, x = 1;
   1123	const unsigned int k = 1 << deg(poly);
   1124
   1125	/* primitive polynomial must be of degree m */
   1126	if (k != (1u << GF_M(bch)))
   1127		return -1;
   1128
   1129	for (i = 0; i < GF_N(bch); i++) {
   1130		bch->a_pow_tab[i] = x;
   1131		bch->a_log_tab[x] = i;
   1132		if (i && (x == 1))
   1133			/* polynomial is not primitive (a^i=1 with 0<i<2^m-1) */
   1134			return -1;
   1135		x <<= 1;
   1136		if (x & k)
   1137			x ^= poly;
   1138	}
   1139	bch->a_pow_tab[GF_N(bch)] = 1;
   1140	bch->a_log_tab[0] = 0;
   1141
   1142	return 0;
   1143}
   1144
   1145/*
   1146 * compute generator polynomial remainder tables for fast encoding
   1147 */
   1148static void build_mod8_tables(struct bch_control *bch, const uint32_t *g)
   1149{
   1150	int i, j, b, d;
   1151	uint32_t data, hi, lo, *tab;
   1152	const int l = BCH_ECC_WORDS(bch);
   1153	const int plen = DIV_ROUND_UP(bch->ecc_bits+1, 32);
   1154	const int ecclen = DIV_ROUND_UP(bch->ecc_bits, 32);
   1155
   1156	memset(bch->mod8_tab, 0, 4*256*l*sizeof(*bch->mod8_tab));
   1157
   1158	for (i = 0; i < 256; i++) {
   1159		/* p(X)=i is a small polynomial of weight <= 8 */
   1160		for (b = 0; b < 4; b++) {
   1161			/* we want to compute (p(X).X^(8*b+deg(g))) mod g(X) */
   1162			tab = bch->mod8_tab + (b*256+i)*l;
   1163			data = i << (8*b);
   1164			while (data) {
   1165				d = deg(data);
   1166				/* subtract X^d.g(X) from p(X).X^(8*b+deg(g)) */
   1167				data ^= g[0] >> (31-d);
   1168				for (j = 0; j < ecclen; j++) {
   1169					hi = (d < 31) ? g[j] << (d+1) : 0;
   1170					lo = (j+1 < plen) ?
   1171						g[j+1] >> (31-d) : 0;
   1172					tab[j] ^= hi|lo;
   1173				}
   1174			}
   1175		}
   1176	}
   1177}
   1178
   1179/*
   1180 * build a base for factoring degree 2 polynomials
   1181 */
   1182static int build_deg2_base(struct bch_control *bch)
   1183{
   1184	const int m = GF_M(bch);
   1185	int i, j, r;
   1186	unsigned int sum, x, y, remaining, ak = 0, xi[BCH_MAX_M];
   1187
   1188	/* find k s.t. Tr(a^k) = 1 and 0 <= k < m */
   1189	for (i = 0; i < m; i++) {
   1190		for (j = 0, sum = 0; j < m; j++)
   1191			sum ^= a_pow(bch, i*(1 << j));
   1192
   1193		if (sum) {
   1194			ak = bch->a_pow_tab[i];
   1195			break;
   1196		}
   1197	}
   1198	/* find xi, i=0..m-1 such that xi^2+xi = a^i+Tr(a^i).a^k */
   1199	remaining = m;
   1200	memset(xi, 0, sizeof(xi));
   1201
   1202	for (x = 0; (x <= GF_N(bch)) && remaining; x++) {
   1203		y = gf_sqr(bch, x)^x;
   1204		for (i = 0; i < 2; i++) {
   1205			r = a_log(bch, y);
   1206			if (y && (r < m) && !xi[r]) {
   1207				bch->xi_tab[r] = x;
   1208				xi[r] = 1;
   1209				remaining--;
   1210				dbg("x%d = %x\n", r, x);
   1211				break;
   1212			}
   1213			y ^= ak;
   1214		}
   1215	}
   1216	/* should not happen but check anyway */
   1217	return remaining ? -1 : 0;
   1218}
   1219
   1220static void *bch_alloc(size_t size, int *err)
   1221{
   1222	void *ptr;
   1223
   1224	ptr = kmalloc(size, GFP_KERNEL);
   1225	if (ptr == NULL)
   1226		*err = 1;
   1227	return ptr;
   1228}
   1229
   1230/*
   1231 * compute generator polynomial for given (m,t) parameters.
   1232 */
   1233static uint32_t *compute_generator_polynomial(struct bch_control *bch)
   1234{
   1235	const unsigned int m = GF_M(bch);
   1236	const unsigned int t = GF_T(bch);
   1237	int n, err = 0;
   1238	unsigned int i, j, nbits, r, word, *roots;
   1239	struct gf_poly *g;
   1240	uint32_t *genpoly;
   1241
   1242	g = bch_alloc(GF_POLY_SZ(m*t), &err);
   1243	roots = bch_alloc((bch->n+1)*sizeof(*roots), &err);
   1244	genpoly = bch_alloc(DIV_ROUND_UP(m*t+1, 32)*sizeof(*genpoly), &err);
   1245
   1246	if (err) {
   1247		kfree(genpoly);
   1248		genpoly = NULL;
   1249		goto finish;
   1250	}
   1251
   1252	/* enumerate all roots of g(X) */
   1253	memset(roots , 0, (bch->n+1)*sizeof(*roots));
   1254	for (i = 0; i < t; i++) {
   1255		for (j = 0, r = 2*i+1; j < m; j++) {
   1256			roots[r] = 1;
   1257			r = mod_s(bch, 2*r);
   1258		}
   1259	}
   1260	/* build generator polynomial g(X) */
   1261	g->deg = 0;
   1262	g->c[0] = 1;
   1263	for (i = 0; i < GF_N(bch); i++) {
   1264		if (roots[i]) {
   1265			/* multiply g(X) by (X+root) */
   1266			r = bch->a_pow_tab[i];
   1267			g->c[g->deg+1] = 1;
   1268			for (j = g->deg; j > 0; j--)
   1269				g->c[j] = gf_mul(bch, g->c[j], r)^g->c[j-1];
   1270
   1271			g->c[0] = gf_mul(bch, g->c[0], r);
   1272			g->deg++;
   1273		}
   1274	}
   1275	/* store left-justified binary representation of g(X) */
   1276	n = g->deg+1;
   1277	i = 0;
   1278
   1279	while (n > 0) {
   1280		nbits = (n > 32) ? 32 : n;
   1281		for (j = 0, word = 0; j < nbits; j++) {
   1282			if (g->c[n-1-j])
   1283				word |= 1u << (31-j);
   1284		}
   1285		genpoly[i++] = word;
   1286		n -= nbits;
   1287	}
   1288	bch->ecc_bits = g->deg;
   1289
   1290finish:
   1291	kfree(g);
   1292	kfree(roots);
   1293
   1294	return genpoly;
   1295}
   1296
   1297/**
   1298 * bch_init - initialize a BCH encoder/decoder
   1299 * @m:          Galois field order, should be in the range 5-15
   1300 * @t:          maximum error correction capability, in bits
   1301 * @prim_poly:  user-provided primitive polynomial (or 0 to use default)
   1302 * @swap_bits:  swap bits within data and syndrome bytes
   1303 *
   1304 * Returns:
   1305 *  a newly allocated BCH control structure if successful, NULL otherwise
   1306 *
   1307 * This initialization can take some time, as lookup tables are built for fast
   1308 * encoding/decoding; make sure not to call this function from a time critical
   1309 * path. Usually, bch_init() should be called on module/driver init and
   1310 * bch_free() should be called to release memory on exit.
   1311 *
   1312 * You may provide your own primitive polynomial of degree @m in argument
   1313 * @prim_poly, or let bch_init() use its default polynomial.
   1314 *
   1315 * Once bch_init() has successfully returned a pointer to a newly allocated
   1316 * BCH control structure, ecc length in bytes is given by member @ecc_bytes of
   1317 * the structure.
   1318 */
   1319struct bch_control *bch_init(int m, int t, unsigned int prim_poly,
   1320			     bool swap_bits)
   1321{
   1322	int err = 0;
   1323	unsigned int i, words;
   1324	uint32_t *genpoly;
   1325	struct bch_control *bch = NULL;
   1326
   1327	const int min_m = 5;
   1328
   1329	/* default primitive polynomials */
   1330	static const unsigned int prim_poly_tab[] = {
   1331		0x25, 0x43, 0x83, 0x11d, 0x211, 0x409, 0x805, 0x1053, 0x201b,
   1332		0x402b, 0x8003,
   1333	};
   1334
   1335#if defined(CONFIG_BCH_CONST_PARAMS)
   1336	if ((m != (CONFIG_BCH_CONST_M)) || (t != (CONFIG_BCH_CONST_T))) {
   1337		printk(KERN_ERR "bch encoder/decoder was configured to support "
   1338		       "parameters m=%d, t=%d only!\n",
   1339		       CONFIG_BCH_CONST_M, CONFIG_BCH_CONST_T);
   1340		goto fail;
   1341	}
   1342#endif
   1343	if ((m < min_m) || (m > BCH_MAX_M))
   1344		/*
   1345		 * values of m greater than 15 are not currently supported;
   1346		 * supporting m > 15 would require changing table base type
   1347		 * (uint16_t) and a small patch in matrix transposition
   1348		 */
   1349		goto fail;
   1350
   1351	if (t > BCH_MAX_T)
   1352		/*
   1353		 * we can support larger than 64 bits if necessary, at the
   1354		 * cost of higher stack usage.
   1355		 */
   1356		goto fail;
   1357
   1358	/* sanity checks */
   1359	if ((t < 1) || (m*t >= ((1 << m)-1)))
   1360		/* invalid t value */
   1361		goto fail;
   1362
   1363	/* select a primitive polynomial for generating GF(2^m) */
   1364	if (prim_poly == 0)
   1365		prim_poly = prim_poly_tab[m-min_m];
   1366
   1367	bch = kzalloc(sizeof(*bch), GFP_KERNEL);
   1368	if (bch == NULL)
   1369		goto fail;
   1370
   1371	bch->m = m;
   1372	bch->t = t;
   1373	bch->n = (1 << m)-1;
   1374	words  = DIV_ROUND_UP(m*t, 32);
   1375	bch->ecc_bytes = DIV_ROUND_UP(m*t, 8);
   1376	bch->a_pow_tab = bch_alloc((1+bch->n)*sizeof(*bch->a_pow_tab), &err);
   1377	bch->a_log_tab = bch_alloc((1+bch->n)*sizeof(*bch->a_log_tab), &err);
   1378	bch->mod8_tab  = bch_alloc(words*1024*sizeof(*bch->mod8_tab), &err);
   1379	bch->ecc_buf   = bch_alloc(words*sizeof(*bch->ecc_buf), &err);
   1380	bch->ecc_buf2  = bch_alloc(words*sizeof(*bch->ecc_buf2), &err);
   1381	bch->xi_tab    = bch_alloc(m*sizeof(*bch->xi_tab), &err);
   1382	bch->syn       = bch_alloc(2*t*sizeof(*bch->syn), &err);
   1383	bch->cache     = bch_alloc(2*t*sizeof(*bch->cache), &err);
   1384	bch->elp       = bch_alloc((t+1)*sizeof(struct gf_poly_deg1), &err);
   1385	bch->swap_bits = swap_bits;
   1386
   1387	for (i = 0; i < ARRAY_SIZE(bch->poly_2t); i++)
   1388		bch->poly_2t[i] = bch_alloc(GF_POLY_SZ(2*t), &err);
   1389
   1390	if (err)
   1391		goto fail;
   1392
   1393	err = build_gf_tables(bch, prim_poly);
   1394	if (err)
   1395		goto fail;
   1396
   1397	/* use generator polynomial for computing encoding tables */
   1398	genpoly = compute_generator_polynomial(bch);
   1399	if (genpoly == NULL)
   1400		goto fail;
   1401
   1402	build_mod8_tables(bch, genpoly);
   1403	kfree(genpoly);
   1404
   1405	err = build_deg2_base(bch);
   1406	if (err)
   1407		goto fail;
   1408
   1409	return bch;
   1410
   1411fail:
   1412	bch_free(bch);
   1413	return NULL;
   1414}
   1415EXPORT_SYMBOL_GPL(bch_init);
   1416
   1417/**
   1418 *  bch_free - free the BCH control structure
   1419 *  @bch:    BCH control structure to release
   1420 */
   1421void bch_free(struct bch_control *bch)
   1422{
   1423	unsigned int i;
   1424
   1425	if (bch) {
   1426		kfree(bch->a_pow_tab);
   1427		kfree(bch->a_log_tab);
   1428		kfree(bch->mod8_tab);
   1429		kfree(bch->ecc_buf);
   1430		kfree(bch->ecc_buf2);
   1431		kfree(bch->xi_tab);
   1432		kfree(bch->syn);
   1433		kfree(bch->cache);
   1434		kfree(bch->elp);
   1435
   1436		for (i = 0; i < ARRAY_SIZE(bch->poly_2t); i++)
   1437			kfree(bch->poly_2t[i]);
   1438
   1439		kfree(bch);
   1440	}
   1441}
   1442EXPORT_SYMBOL_GPL(bch_free);
   1443
   1444MODULE_LICENSE("GPL");
   1445MODULE_AUTHOR("Ivan Djelic <ivan.djelic@parrot.com>");
   1446MODULE_DESCRIPTION("Binary BCH encoder/decoder");