cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

crc32-vpmsum_core.S (14207B)


      1/* SPDX-License-Identifier: GPL-2.0-or-later */
      2/*
      3 * Core of the accelerated CRC algorithm.
      4 * In your file, define the constants and CRC_FUNCTION_NAME,
      5 * then include this file.
      6 *
      7 * Calculate the checksum of data that is 16 byte aligned and a multiple of
      8 * 16 bytes.
      9 *
     10 * The first step is to reduce it to 1024 bits. We do this in 8 parallel
     11 * chunks in order to mask the latency of the vpmsum instructions. If we
     12 * have more than 32 kB of data to checksum we repeat this step multiple
     13 * times, passing in the previous 1024 bits.
     14 *
     15 * The next step is to reduce the 1024 bits to 64 bits. This step adds
     16 * 32 bits of 0s to the end - this matches what a CRC does. We just
     17 * calculate constants that land the data in these 32 bits.
     18 *
     19 * We then use fixed point Barrett reduction to compute a mod n over GF(2)
     20 * for n = CRC using POWER8 instructions. We use x = 32.
     21 *
     22 * https://en.wikipedia.org/wiki/Barrett_reduction
     23 *
     24 * Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM
     25*/
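
/*
 * A minimal sketch of the expected wrapper file, modelled on the in-tree
 * crc32c user (the function name, REFLECT setting and table contents are
 * illustrative; each CRC supplies its own):
 *
 *	#define CRC_FUNCTION_NAME	__crc32c_vpmsum
 *	#define REFLECT			// bit-reflected polynomial
 *	.byteswap_constant:	// only needed when BYTESWAP_DATA is set
 *	.constants:		// one 16B constant per 128B block of MAX_SIZE
 *	.short_constants:	// constants for the < 256 byte tail path
 *	.barrett_constants:	// Barrett multiplier m and the polynomial n
 *	#include "crc32-vpmsum_core.S"
 */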
     26
     27#include <asm/ppc_asm.h>
     28#include <asm/ppc-opcode.h>
     29
     30#define MAX_SIZE	32768
     31
     32	.text
     33
     34#if defined(__BIG_ENDIAN__) && defined(REFLECT)
     35#define BYTESWAP_DATA
     36#elif defined(__LITTLE_ENDIAN__) && !defined(REFLECT)
     37#define BYTESWAP_DATA
     38#else
     39#undef BYTESWAP_DATA
     40#endif
     41
     42#define off16		r25
     43#define off32		r26
     44#define off48		r27
     45#define off64		r28
     46#define off80		r29
     47#define off96		r30
     48#define off112		r31
     49
     50#define const1		v24
     51#define const2		v25
     52
     53#define byteswap	v26
     54#define	mask_32bit	v27
     55#define	mask_64bit	v28
     56#define zeroes		v29
     57
     58#ifdef BYTESWAP_DATA
     59#define VPERM(A, B, C, D) vperm	A, B, C, D
     60#else
     61#define VPERM(A, B, C, D)
     62#endif
     63
     64/* unsigned int CRC_FUNCTION_NAME(unsigned int crc, void *p, unsigned long len) */
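/*
 * Per the 64-bit PowerPC ELF ABI the arguments arrive as r3 = crc,
 * r4 = p, r5 = len (which is how the code below uses those registers),
 * and the resulting CRC is returned in r3.
 */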
     65FUNC_START(CRC_FUNCTION_NAME)
     66	std	r31,-8(r1)
     67	std	r30,-16(r1)
     68	std	r29,-24(r1)
     69	std	r28,-32(r1)
     70	std	r27,-40(r1)
     71	std	r26,-48(r1)
     72	std	r25,-56(r1)
     73
     74	li	off16,16
     75	li	off32,32
     76	li	off48,48
     77	li	off64,64
     78	li	off80,80
     79	li	off96,96
     80	li	off112,112
     81	li	r0,0
     82
     83	/* Enough room for saving 10 non-volatile VMX registers */
     84	subi	r6,r1,56+10*16
     85	subi	r7,r1,56+2*16
     86
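	/*
	 * Save-area sketch, derived from the offsets above: r25-r31 live in
	 * the 56 bytes immediately below r1, v20-v29 in the 160 bytes below
	 * that; r6 and r7 point into that area for the stvx stores below.
	 */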
     87	stvx	v20,0,r6
     88	stvx	v21,off16,r6
     89	stvx	v22,off32,r6
     90	stvx	v23,off48,r6
     91	stvx	v24,off64,r6
     92	stvx	v25,off80,r6
     93	stvx	v26,off96,r6
     94	stvx	v27,off112,r6
     95	stvx	v28,0,r7
     96	stvx	v29,off16,r7
     97
     98	mr	r10,r3
     99
    100	vxor	zeroes,zeroes,zeroes
    101	vspltisw v0,-1
    102
    103	vsldoi	mask_32bit,zeroes,v0,4
    104	vsldoi	mask_64bit,zeroes,v0,8
    105
    106	/* Get the initial value into v8 */
    107	vxor	v8,v8,v8
    108	MTVRD(v8, R3)
    109#ifdef REFLECT
    110	vsldoi	v8,zeroes,v8,8	/* shift into bottom 32 bits */
    111#else
    112	vsldoi	v8,v8,zeroes,4	/* shift into top 32 bits */
    113#endif
    114
    115#ifdef BYTESWAP_DATA
    116	addis	r3,r2,.byteswap_constant@toc@ha
    117	addi	r3,r3,.byteswap_constant@toc@l
    118
    119	lvx	byteswap,0,r3
    120	addi	r3,r3,16
    121#endif
    122
    123	cmpdi	r5,256
    124	blt	.Lshort
    125
    126	rldicr	r6,r5,0,56
    127
    128	/* Checksum in blocks of MAX_SIZE */
    1291:	lis	r7,MAX_SIZE@h
    130	ori	r7,r7,MAX_SIZE@l
    131	mr	r9,r7
    132	cmpd	r6,r7
    133	bgt	2f
    134	mr	r7,r6
    1352:	subf	r6,r7,r6
    136
    137	/* our main loop does 128 bytes at a time */
    138	srdi	r7,r7,7
    139
    140	/*
    141	 * Work out the offset into the constants table to start at. Each
    142	 * constant is 16 bytes, and it is used against 128 bytes of input
    143	 * data - 128 / 16 = 8
    144	 */
    145	sldi	r8,r7,4
    146	srdi	r9,r9,3
    147	subf	r8,r8,r9
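
	/*
	 * Worked example of the offset calculation above: for a 256-byte
	 * chunk, r7 = 256/128 = 2 blocks, r8 = 2*16 = 32, r9 = 32768/8 =
	 * 4096, so we start 4096 - 32 = 4064 bytes into .constants, i.e.
	 * two 16-byte constants from the end of a table that has one entry
	 * per 128-byte block of MAX_SIZE.
	 */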
    148
    149	/* We reduce our final 128 bytes in a separate step */
    150	addi	r7,r7,-1
    151	mtctr	r7
    152
    153	addis	r3,r2,.constants@toc@ha
    154	addi	r3,r3,.constants@toc@l
    155
    156	/* Find the start of our constants */
    157	add	r3,r3,r8
    158
    159	/* zero v0-v7 which will contain our checksums */
    160	vxor	v0,v0,v0
    161	vxor	v1,v1,v1
    162	vxor	v2,v2,v2
    163	vxor	v3,v3,v3
    164	vxor	v4,v4,v4
    165	vxor	v5,v5,v5
    166	vxor	v6,v6,v6
    167	vxor	v7,v7,v7
    168
    169	lvx	const1,0,r3
    170
    171	/*
    172	 * If we are looping back to consume more data we use the values
    173	 * already in v16-v23.
    174	 */
    175	cmpdi	r0,1
    176	beq	2f
    177
    178	/* First warm up pass */
    179	lvx	v16,0,r4
    180	lvx	v17,off16,r4
    181	VPERM(v16,v16,v16,byteswap)
    182	VPERM(v17,v17,v17,byteswap)
    183	lvx	v18,off32,r4
    184	lvx	v19,off48,r4
    185	VPERM(v18,v18,v18,byteswap)
    186	VPERM(v19,v19,v19,byteswap)
    187	lvx	v20,off64,r4
    188	lvx	v21,off80,r4
    189	VPERM(v20,v20,v20,byteswap)
    190	VPERM(v21,v21,v21,byteswap)
    191	lvx	v22,off96,r4
    192	lvx	v23,off112,r4
    193	VPERM(v22,v22,v22,byteswap)
    194	VPERM(v23,v23,v23,byteswap)
    195	addi	r4,r4,8*16
    196
    197	/* xor in initial value */
    198	vxor	v16,v16,v8
    199
    2002:	bdz	.Lfirst_warm_up_done
    201
    202	addi	r3,r3,16
    203	lvx	const2,0,r3
    204
    205	/* Second warm up pass */
    206	VPMSUMD(v8,v16,const1)
    207	lvx	v16,0,r4
    208	VPERM(v16,v16,v16,byteswap)
    209	ori	r2,r2,0
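	/*
	 * "ori r2,r2,0" is architecturally a no-op (r2 OR 0); it appears
	 * throughout this file purely as padding to influence instruction
	 * grouping/dispatch rather than for its result.
	 */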
    210
    211	VPMSUMD(v9,v17,const1)
    212	lvx	v17,off16,r4
    213	VPERM(v17,v17,v17,byteswap)
    214	ori	r2,r2,0
    215
    216	VPMSUMD(v10,v18,const1)
    217	lvx	v18,off32,r4
    218	VPERM(v18,v18,v18,byteswap)
    219	ori	r2,r2,0
    220
    221	VPMSUMD(v11,v19,const1)
    222	lvx	v19,off48,r4
    223	VPERM(v19,v19,v19,byteswap)
    224	ori	r2,r2,0
    225
    226	VPMSUMD(v12,v20,const1)
    227	lvx	v20,off64,r4
    228	VPERM(v20,v20,v20,byteswap)
    229	ori	r2,r2,0
    230
    231	VPMSUMD(v13,v21,const1)
    232	lvx	v21,off80,r4
    233	VPERM(v21,v21,v21,byteswap)
    234	ori	r2,r2,0
    235
    236	VPMSUMD(v14,v22,const1)
    237	lvx	v22,off96,r4
    238	VPERM(v22,v22,v22,byteswap)
    239	ori	r2,r2,0
    240
    241	VPMSUMD(v15,v23,const1)
    242	lvx	v23,off112,r4
    243	VPERM(v23,v23,v23,byteswap)
    244
    245	addi	r4,r4,8*16
    246
    247	bdz	.Lfirst_cool_down
    248
    249	/*
    250	 * main loop. We modulo schedule it such that it takes three iterations
    251	 * to complete - first iteration load, second iteration vpmsum, third
    252	 * iteration xor.
    253	 */
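	/*
	 * In other words: data loaded in iteration N is multiplied by
	 * vpmsumd in iteration N+1, and that product is xor-folded into
	 * v0-v7 in iteration N+2, so the three stages from consecutive
	 * iterations overlap on every pass through the loop.
	 */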
    254	.balign	16
    2554:	lvx	const1,0,r3
    256	addi	r3,r3,16
    257	ori	r2,r2,0
    258
    259	vxor	v0,v0,v8
    260	VPMSUMD(v8,v16,const2)
    261	lvx	v16,0,r4
    262	VPERM(v16,v16,v16,byteswap)
    263	ori	r2,r2,0
    264
    265	vxor	v1,v1,v9
    266	VPMSUMD(v9,v17,const2)
    267	lvx	v17,off16,r4
    268	VPERM(v17,v17,v17,byteswap)
    269	ori	r2,r2,0
    270
    271	vxor	v2,v2,v10
    272	VPMSUMD(v10,v18,const2)
    273	lvx	v18,off32,r4
    274	VPERM(v18,v18,v18,byteswap)
    275	ori	r2,r2,0
    276
    277	vxor	v3,v3,v11
    278	VPMSUMD(v11,v19,const2)
    279	lvx	v19,off48,r4
    280	VPERM(v19,v19,v19,byteswap)
    281	lvx	const2,0,r3
    282	ori	r2,r2,0
    283
    284	vxor	v4,v4,v12
    285	VPMSUMD(v12,v20,const1)
    286	lvx	v20,off64,r4
    287	VPERM(v20,v20,v20,byteswap)
    288	ori	r2,r2,0
    289
    290	vxor	v5,v5,v13
    291	VPMSUMD(v13,v21,const1)
    292	lvx	v21,off80,r4
    293	VPERM(v21,v21,v21,byteswap)
    294	ori	r2,r2,0
    295
    296	vxor	v6,v6,v14
    297	VPMSUMD(v14,v22,const1)
    298	lvx	v22,off96,r4
    299	VPERM(v22,v22,v22,byteswap)
    300	ori	r2,r2,0
    301
    302	vxor	v7,v7,v15
    303	VPMSUMD(v15,v23,const1)
    304	lvx	v23,off112,r4
    305	VPERM(v23,v23,v23,byteswap)
    306
    307	addi	r4,r4,8*16
    308
    309	bdnz	4b
    310
    311.Lfirst_cool_down:
    312	/* First cool down pass */
    313	lvx	const1,0,r3
    314	addi	r3,r3,16
    315
    316	vxor	v0,v0,v8
    317	VPMSUMD(v8,v16,const1)
    318	ori	r2,r2,0
    319
    320	vxor	v1,v1,v9
    321	VPMSUMD(v9,v17,const1)
    322	ori	r2,r2,0
    323
    324	vxor	v2,v2,v10
    325	VPMSUMD(v10,v18,const1)
    326	ori	r2,r2,0
    327
    328	vxor	v3,v3,v11
    329	VPMSUMD(v11,v19,const1)
    330	ori	r2,r2,0
    331
    332	vxor	v4,v4,v12
    333	VPMSUMD(v12,v20,const1)
    334	ori	r2,r2,0
    335
    336	vxor	v5,v5,v13
    337	VPMSUMD(v13,v21,const1)
    338	ori	r2,r2,0
    339
    340	vxor	v6,v6,v14
    341	VPMSUMD(v14,v22,const1)
    342	ori	r2,r2,0
    343
    344	vxor	v7,v7,v15
    345	VPMSUMD(v15,v23,const1)
    346	ori	r2,r2,0
    347
    348.Lsecond_cool_down:
    349	/* Second cool down pass */
    350	vxor	v0,v0,v8
    351	vxor	v1,v1,v9
    352	vxor	v2,v2,v10
    353	vxor	v3,v3,v11
    354	vxor	v4,v4,v12
    355	vxor	v5,v5,v13
    356	vxor	v6,v6,v14
    357	vxor	v7,v7,v15
    358
    359#ifdef REFLECT
    360	/*
    361	 * vpmsumd produces a 96 bit result in the least significant bits
    362	 * of the register. Since we are bit reflected we have to shift it
    363	 * left 32 bits so it occupies the least significant bits in the
    364	 * bit reflected domain.
    365	 */
    366	vsldoi	v0,v0,zeroes,4
    367	vsldoi	v1,v1,zeroes,4
    368	vsldoi	v2,v2,zeroes,4
    369	vsldoi	v3,v3,zeroes,4
    370	vsldoi	v4,v4,zeroes,4
    371	vsldoi	v5,v5,zeroes,4
    372	vsldoi	v6,v6,zeroes,4
    373	vsldoi	v7,v7,zeroes,4
    374#endif
    375
    376	/* xor with last 1024 bits */
    377	lvx	v8,0,r4
    378	lvx	v9,off16,r4
    379	VPERM(v8,v8,v8,byteswap)
    380	VPERM(v9,v9,v9,byteswap)
    381	lvx	v10,off32,r4
    382	lvx	v11,off48,r4
    383	VPERM(v10,v10,v10,byteswap)
    384	VPERM(v11,v11,v11,byteswap)
    385	lvx	v12,off64,r4
    386	lvx	v13,off80,r4
    387	VPERM(v12,v12,v12,byteswap)
    388	VPERM(v13,v13,v13,byteswap)
    389	lvx	v14,off96,r4
    390	lvx	v15,off112,r4
    391	VPERM(v14,v14,v14,byteswap)
    392	VPERM(v15,v15,v15,byteswap)
    393
    394	addi	r4,r4,8*16
    395
    396	vxor	v16,v0,v8
    397	vxor	v17,v1,v9
    398	vxor	v18,v2,v10
    399	vxor	v19,v3,v11
    400	vxor	v20,v4,v12
    401	vxor	v21,v5,v13
    402	vxor	v22,v6,v14
    403	vxor	v23,v7,v15
    404
    405	li	r0,1
    406	cmpdi	r6,0
    407	addi	r6,r6,128
    408	bne	1b
    409
    410	/* Work out how many bytes we have left */
    411	andi.	r5,r5,127
    412
    413	/* Calculate where in the constant table we need to start */
    414	subfic	r6,r5,128
    415	add	r3,r3,r6
    416
    417	/* How many 16 byte chunks are in the tail */
    418	srdi	r7,r5,4
    419	mtctr	r7
    420
    421	/*
    422	 * Reduce the previously calculated 1024 bits to 64 bits, shifting
    423	 * 32 bits to include the trailing 32 bits of zeros
    424	 */
    425	lvx	v0,0,r3
    426	lvx	v1,off16,r3
    427	lvx	v2,off32,r3
    428	lvx	v3,off48,r3
    429	lvx	v4,off64,r3
    430	lvx	v5,off80,r3
    431	lvx	v6,off96,r3
    432	lvx	v7,off112,r3
    433	addi	r3,r3,8*16
    434
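	/*
	 * VPMSUMW (used below) carry-lessly multiplies corresponding 32-bit
	 * words and xors the products pairwise into 64-bit halves; with the
	 * per-chunk constants just loaded, this is what reduces the 1024
	 * accumulated bits toward the final 64, with the trailing 32 zero
	 * bits folded in as described above.
	 */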
    435	VPMSUMW(v0,v16,v0)
    436	VPMSUMW(v1,v17,v1)
    437	VPMSUMW(v2,v18,v2)
    438	VPMSUMW(v3,v19,v3)
    439	VPMSUMW(v4,v20,v4)
    440	VPMSUMW(v5,v21,v5)
    441	VPMSUMW(v6,v22,v6)
    442	VPMSUMW(v7,v23,v7)
    443
    444	/* Now reduce the tail (0 - 112 bytes) */
    445	cmpdi	r7,0
    446	beq	1f
    447
    448	lvx	v16,0,r4
    449	lvx	v17,0,r3
    450	VPERM(v16,v16,v16,byteswap)
    451	VPMSUMW(v16,v16,v17)
    452	vxor	v0,v0,v16
    453	bdz	1f
    454
    455	lvx	v16,off16,r4
    456	lvx	v17,off16,r3
    457	VPERM(v16,v16,v16,byteswap)
    458	VPMSUMW(v16,v16,v17)
    459	vxor	v0,v0,v16
    460	bdz	1f
    461
    462	lvx	v16,off32,r4
    463	lvx	v17,off32,r3
    464	VPERM(v16,v16,v16,byteswap)
    465	VPMSUMW(v16,v16,v17)
    466	vxor	v0,v0,v16
    467	bdz	1f
    468
    469	lvx	v16,off48,r4
    470	lvx	v17,off48,r3
    471	VPERM(v16,v16,v16,byteswap)
    472	VPMSUMW(v16,v16,v17)
    473	vxor	v0,v0,v16
    474	bdz	1f
    475
    476	lvx	v16,off64,r4
    477	lvx	v17,off64,r3
    478	VPERM(v16,v16,v16,byteswap)
    479	VPMSUMW(v16,v16,v17)
    480	vxor	v0,v0,v16
    481	bdz	1f
    482
    483	lvx	v16,off80,r4
    484	lvx	v17,off80,r3
    485	VPERM(v16,v16,v16,byteswap)
    486	VPMSUMW(v16,v16,v17)
    487	vxor	v0,v0,v16
    488	bdz	1f
    489
    490	lvx	v16,off96,r4
    491	lvx	v17,off96,r3
    492	VPERM(v16,v16,v16,byteswap)
    493	VPMSUMW(v16,v16,v17)
    494	vxor	v0,v0,v16
    495
    496	/* Now xor all the parallel chunks together */
    4971:	vxor	v0,v0,v1
    498	vxor	v2,v2,v3
    499	vxor	v4,v4,v5
    500	vxor	v6,v6,v7
    501
    502	vxor	v0,v0,v2
    503	vxor	v4,v4,v6
    504
    505	vxor	v0,v0,v4
    506
    507.Lbarrett_reduction:
    508	/* Barrett constants */
    509	addis	r3,r2,.barrett_constants@toc@ha
    510	addi	r3,r3,.barrett_constants@toc@l
    511
    512	lvx	const1,0,r3
    513	lvx	const2,off16,r3
    514
    515	vsldoi	v1,v0,v0,8
    516	vxor	v0,v0,v1		/* xor two 64 bit results together */
    517
    518#ifdef REFLECT
    519	/* shift left one bit */
    520	vspltisb v1,1
    521	vsl	v0,v0,v1
    522#endif
    523
    524	vand	v0,v0,mask_64bit
    525#ifndef REFLECT
    526	/*
    527	 * Now for the Barrett reduction algorithm. The idea is to calculate q,
    528	 * the multiple of our polynomial that we need to subtract. By
    529	 * doing the computation 2x bits higher (ie 64 bits) and shifting the
    530	 * result back down 2x bits, we round down to the nearest multiple.
    531	 */
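	/*
	 * Worked sketch over GF(2), with a the current remainder, n the CRC
	 * polynomial and m = floor(2^64 / n) as the first Barrett constant:
	 *
	 *	q   = floor((a * m) / 2^64)	// the vsldoi below drops the
	 *					// low 64 bits of a*m
	 *	crc = a xor (q * n)		// low 32 bits are the result
	 *
	 * Addition and subtraction are both xor in GF(2), so no borrows are
	 * needed.
	 */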
    532	VPMSUMD(v1,v0,const1)	/* ma */
    533	vsldoi	v1,zeroes,v1,8	/* q = floor(ma/(2^64)) */
    534	VPMSUMD(v1,v1,const2)	/* qn */
    535	vxor	v0,v0,v1	/* a - qn, subtraction is xor in GF(2) */
    536
    537	/*
    538	 * Get the result into r3. We need to shift it left 8 bytes:
    539	 * V0 [ 0 1 2 X ]
    540	 * V0 [ 0 X 2 3 ]
    541	 */
    542	vsldoi	v0,v0,zeroes,8	/* shift result into top 64 bits */
    543#else
    544	/*
    545	 * The reflected version of Barrett reduction. Instead of bit
    546	 * reflecting our data (which is expensive to do), we bit reflect our
    547	 * constants and our algorithm, which means the intermediate data in
    548	 * our vector registers goes from 0-63 instead of 63-0. We can reflect
    549	 * the algorithm because we don't carry in mod 2 arithmetic.
    550	 */
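	/*
	 * This mirrors the computation above, but on bit-reflected values:
	 * the quotient estimate is built from the low 32 bits (the high bits
	 * of the unreflected value), multiplied by the reflected polynomial
	 * and xor-ed back into a, leaving the 32-bit CRC in the high word.
	 */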
    551	vand	v1,v0,mask_32bit	/* bottom 32 bits of a */
    552	VPMSUMD(v1,v1,const1)		/* ma */
    553	vand	v1,v1,mask_32bit	/* bottom 32bits of ma */
    554	VPMSUMD(v1,v1,const2)		/* qn */
    555	vxor	v0,v0,v1		/* a - qn, subtraction is xor in GF(2) */
    556
    557	/*
    558	 * Since we are bit reflected, the result (ie the low 32 bits) is in
    559	 * the high 32 bits. We just need to shift it left 4 bytes
    560	 * V0 [ 0 1 X 3 ]
    561	 * V0 [ 0 X 2 3 ]
    562	 */
    563	vsldoi	v0,v0,zeroes,4		/* shift result into top 64 bits of the register */
    564#endif
    565
    566	/* Get it into r3 */
    567	MFVRD(R3, v0)
    568
    569.Lout:
    570	subi	r6,r1,56+10*16
    571	subi	r7,r1,56+2*16
    572
    573	lvx	v20,0,r6
    574	lvx	v21,off16,r6
    575	lvx	v22,off32,r6
    576	lvx	v23,off48,r6
    577	lvx	v24,off64,r6
    578	lvx	v25,off80,r6
    579	lvx	v26,off96,r6
    580	lvx	v27,off112,r6
    581	lvx	v28,0,r7
    582	lvx	v29,off16,r7
    583
    584	ld	r31,-8(r1)
    585	ld	r30,-16(r1)
    586	ld	r29,-24(r1)
    587	ld	r28,-32(r1)
    588	ld	r27,-40(r1)
    589	ld	r26,-48(r1)
    590	ld	r25,-56(r1)
    591
    592	blr
    593
    594.Lfirst_warm_up_done:
    595	lvx	const1,0,r3
    596	addi	r3,r3,16
    597
    598	VPMSUMD(v8,v16,const1)
    599	VPMSUMD(v9,v17,const1)
    600	VPMSUMD(v10,v18,const1)
    601	VPMSUMD(v11,v19,const1)
    602	VPMSUMD(v12,v20,const1)
    603	VPMSUMD(v13,v21,const1)
    604	VPMSUMD(v14,v22,const1)
    605	VPMSUMD(v15,v23,const1)
    606
    607	b	.Lsecond_cool_down
    608
    609.Lshort:
    610	cmpdi	r5,0
    611	beq	.Lzero
    612
    613	addis	r3,r2,.short_constants@toc@ha
    614	addi	r3,r3,.short_constants@toc@l
    615
    616	/* Calculate where in the constant table we need to start */
    617	subfic	r6,r5,256
    618	add	r3,r3,r6
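
	/*
	 * For example, a 64-byte buffer starts 256 - 64 = 192 bytes into
	 * .short_constants: the short table pairs one 16-byte constant with
	 * each 16-byte chunk of up to 256 bytes of input.
	 */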
    619
    620	/* How many 16 byte chunks? */
    621	srdi	r7,r5,4
    622	mtctr	r7
    623
    624	vxor	v19,v19,v19
    625	vxor	v20,v20,v20
    626
    627	lvx	v0,0,r4
    628	lvx	v16,0,r3
    629	VPERM(v0,v0,v16,byteswap)
    630	vxor	v0,v0,v8	/* xor in initial value */
    631	VPMSUMW(v0,v0,v16)
    632	bdz	.Lv0
    633
    634	lvx	v1,off16,r4
    635	lvx	v17,off16,r3
    636	VPERM(v1,v1,v17,byteswap)
    637	VPMSUMW(v1,v1,v17)
    638	bdz	.Lv1
    639
    640	lvx	v2,off32,r4
    641	lvx	v16,off32,r3
    642	VPERM(v2,v2,v16,byteswap)
    643	VPMSUMW(v2,v2,v16)
    644	bdz	.Lv2
    645
    646	lvx	v3,off48,r4
    647	lvx	v17,off48,r3
    648	VPERM(v3,v3,v17,byteswap)
    649	VPMSUMW(v3,v3,v17)
    650	bdz	.Lv3
    651
    652	lvx	v4,off64,r4
    653	lvx	v16,off64,r3
    654	VPERM(v4,v4,v16,byteswap)
    655	VPMSUMW(v4,v4,v16)
    656	bdz	.Lv4
    657
    658	lvx	v5,off80,r4
    659	lvx	v17,off80,r3
    660	VPERM(v5,v5,v17,byteswap)
    661	VPMSUMW(v5,v5,v17)
    662	bdz	.Lv5
    663
    664	lvx	v6,off96,r4
    665	lvx	v16,off96,r3
    666	VPERM(v6,v6,v16,byteswap)
    667	VPMSUMW(v6,v6,v16)
    668	bdz	.Lv6
    669
    670	lvx	v7,off112,r4
    671	lvx	v17,off112,r3
    672	VPERM(v7,v7,v17,byteswap)
    673	VPMSUMW(v7,v7,v17)
    674	bdz	.Lv7
    675
    676	addi	r3,r3,128
    677	addi	r4,r4,128
    678
    679	lvx	v8,0,r4
    680	lvx	v16,0,r3
    681	VPERM(v8,v8,v16,byteswap)
    682	VPMSUMW(v8,v8,v16)
    683	bdz	.Lv8
    684
    685	lvx	v9,off16,r4
    686	lvx	v17,off16,r3
    687	VPERM(v9,v9,v17,byteswap)
    688	VPMSUMW(v9,v9,v17)
    689	bdz	.Lv9
    690
    691	lvx	v10,off32,r4
    692	lvx	v16,off32,r3
    693	VPERM(v10,v10,v16,byteswap)
    694	VPMSUMW(v10,v10,v16)
    695	bdz	.Lv10
    696
    697	lvx	v11,off48,r4
    698	lvx	v17,off48,r3
    699	VPERM(v11,v11,v17,byteswap)
    700	VPMSUMW(v11,v11,v17)
    701	bdz	.Lv11
    702
    703	lvx	v12,off64,r4
    704	lvx	v16,off64,r3
    705	VPERM(v12,v12,v16,byteswap)
    706	VPMSUMW(v12,v12,v16)
    707	bdz	.Lv12
    708
    709	lvx	v13,off80,r4
    710	lvx	v17,off80,r3
    711	VPERM(v13,v13,v17,byteswap)
    712	VPMSUMW(v13,v13,v17)
    713	bdz	.Lv13
    714
    715	lvx	v14,off96,r4
    716	lvx	v16,off96,r3
    717	VPERM(v14,v14,v16,byteswap)
    718	VPMSUMW(v14,v14,v16)
    719	bdz	.Lv14
    720
    721	lvx	v15,off112,r4
    722	lvx	v17,off112,r3
    723	VPERM(v15,v15,v17,byteswap)
    724	VPMSUMW(v15,v15,v17)
    725
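	/*
	 * The .Lv* labels below form a fall-through chain: each bdz above
	 * lands at the label for the last chunk processed, and execution
	 * falls through the remaining xors, folding every product computed
	 * so far into the two accumulators v19 and v20.
	 */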
    726.Lv15:	vxor	v19,v19,v15
    727.Lv14:	vxor	v20,v20,v14
    728.Lv13:	vxor	v19,v19,v13
    729.Lv12:	vxor	v20,v20,v12
    730.Lv11:	vxor	v19,v19,v11
    731.Lv10:	vxor	v20,v20,v10
    732.Lv9:	vxor	v19,v19,v9
    733.Lv8:	vxor	v20,v20,v8
    734.Lv7:	vxor	v19,v19,v7
    735.Lv6:	vxor	v20,v20,v6
    736.Lv5:	vxor	v19,v19,v5
    737.Lv4:	vxor	v20,v20,v4
    738.Lv3:	vxor	v19,v19,v3
    739.Lv2:	vxor	v20,v20,v2
    740.Lv1:	vxor	v19,v19,v1
    741.Lv0:	vxor	v20,v20,v0
    742
    743	vxor	v0,v19,v20
    744
    745	b	.Lbarrett_reduction
    746
    747.Lzero:
    748	mr	r3,r10
    749	b	.Lout
    750
    751FUNC_END(CRC_FUNCTION_NAME)