cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

csum_partial.S (16274B)


/*
 * This file is subject to the terms and conditions of the GNU General Public
 * License.  See the file "COPYING" in the main directory of this archive
 * for more details.
 *
 * Quick'n'dirty IP checksum ...
 *
 * Copyright (C) 1998, 1999 Ralf Baechle
 * Copyright (C) 1999 Silicon Graphics, Inc.
 * Copyright (C) 2007  Maciej W. Rozycki
 * Copyright (C) 2014 Imagination Technologies Ltd.
 */
#include <linux/errno.h>
#include <asm/asm.h>
#include <asm/asm-offsets.h>
#include <asm/export.h>
#include <asm/regdef.h>

#ifdef CONFIG_64BIT
/*
 * Since we share the code base with the mips32 tree (which uses the o32 ABI
 * register definitions), we need to redefine the register definitions from
 * the n64 ABI register naming to the o32 ABI register naming.
 */
#undef t0
#undef t1
#undef t2
#undef t3
#define t0	$8
#define t1	$9
#define t2	$10
#define t3	$11
#define t4	$12
#define t5	$13
#define t6	$14
#define t7	$15

#define USE_DOUBLE
#endif

#ifdef USE_DOUBLE

#define LOAD   ld
#define LOAD32 lwu
#define ADD    daddu
#define NBYTES 8

#else

#define LOAD   lw
#define LOAD32 lw
#define ADD    addu
#define NBYTES 4

#endif /* USE_DOUBLE */

#define UNIT(unit)  ((unit)*NBYTES)

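/*
 * ADDC/ADDC32 perform the end-around-carry add used by the ones'
 * complement Internet checksum: after "sum += reg", sltu detects a
 * wrap (sum < reg) and the resulting carry bit is added back into sum.
 * ADDC uses the NBYTES-wide ADD (daddu on 64-bit builds); ADDC32 is
 * always a 32-bit addu and is used to fold in the caller-supplied
 * partial checksum.
 */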
#define ADDC(sum,reg)						\
	.set	push;						\
	.set	noat;						\
	ADD	sum, reg;					\
	sltu	v1, sum, reg;					\
	ADD	sum, v1;					\
	.set	pop

#define ADDC32(sum,reg)						\
	.set	push;						\
	.set	noat;						\
	addu	sum, reg;					\
	sltu	v1, sum, reg;					\
	addu	sum, v1;					\
	.set	pop

#define CSUM_BIGCHUNK1(src, offset, sum, _t0, _t1, _t2, _t3)	\
	LOAD	_t0, (offset + UNIT(0))(src);			\
	LOAD	_t1, (offset + UNIT(1))(src);			\
	LOAD	_t2, (offset + UNIT(2))(src);			\
	LOAD	_t3, (offset + UNIT(3))(src);			\
	ADDC(_t0, _t1);						\
	ADDC(_t2, _t3);						\
	ADDC(sum, _t0);						\
	ADDC(sum, _t2)

#ifdef USE_DOUBLE
#define CSUM_BIGCHUNK(src, offset, sum, _t0, _t1, _t2, _t3)	\
	CSUM_BIGCHUNK1(src, offset, sum, _t0, _t1, _t2, _t3)
#else
#define CSUM_BIGCHUNK(src, offset, sum, _t0, _t1, _t2, _t3)	\
	CSUM_BIGCHUNK1(src, offset, sum, _t0, _t1, _t2, _t3);	\
	CSUM_BIGCHUNK1(src, offset + 0x10, sum, _t0, _t1, _t2, _t3)
#endif
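/*
 * CSUM_BIGCHUNK sums 32 bytes per invocation: four NBYTES-sized loads
 * when USE_DOUBLE is set (4 x 8 bytes), otherwise two CSUM_BIGCHUNK1
 * passes of four 32-bit loads each (8 x 4 bytes).
 */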

/*
 * a0: source address
 * a1: length of the area to checksum
 * a2: partial checksum
 */
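/* The folded 32-bit partial sum (not complemented) is returned in v0. */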

#define src a0
#define sum v0

	.text
	.set	noreorder
	.align	5
LEAF(csum_partial)
EXPORT_SYMBOL(csum_partial)
	move	sum, zero
	move	t7, zero

	sltiu	t8, a1, 0x8
	bnez	t8, .Lsmall_csumcpy		/* < 8 bytes to copy */
	 move	t2, a1

	andi	t7, src, 0x1			/* odd buffer? */

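/*
 * Progressively align src: the .Lhword/.Lword/.Ldword/.Lqword/.Loword
 * blocks below consume 1, 2, 4, 8 and 16 bytes as needed so that the
 * 128-byte main loop runs on an aligned buffer.  t7 (set above) records
 * whether the buffer started on an odd address so the result can be
 * byte-swapped before returning.
 */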
.Lhword_align:
	beqz	t7, .Lword_align
	 andi	t8, src, 0x2

	lbu	t0, (src)
	LONG_SUBU	a1, a1, 0x1
#ifdef __MIPSEL__
	sll	t0, t0, 8
#endif
	ADDC(sum, t0)
	PTR_ADDU	src, src, 0x1
	andi	t8, src, 0x2

.Lword_align:
	beqz	t8, .Ldword_align
	 sltiu	t8, a1, 56

	lhu	t0, (src)
	LONG_SUBU	a1, a1, 0x2
	ADDC(sum, t0)
	sltiu	t8, a1, 56
	PTR_ADDU	src, src, 0x2

.Ldword_align:
	bnez	t8, .Ldo_end_words
	 move	t8, a1

	andi	t8, src, 0x4
	beqz	t8, .Lqword_align
	 andi	t8, src, 0x8

	LOAD32	t0, 0x00(src)
	LONG_SUBU	a1, a1, 0x4
	ADDC(sum, t0)
	PTR_ADDU	src, src, 0x4
	andi	t8, src, 0x8

.Lqword_align:
	beqz	t8, .Loword_align
	 andi	t8, src, 0x10

#ifdef USE_DOUBLE
	ld	t0, 0x00(src)
	LONG_SUBU	a1, a1, 0x8
	ADDC(sum, t0)
#else
	lw	t0, 0x00(src)
	lw	t1, 0x04(src)
	LONG_SUBU	a1, a1, 0x8
	ADDC(sum, t0)
	ADDC(sum, t1)
#endif
	PTR_ADDU	src, src, 0x8
	andi	t8, src, 0x10

.Loword_align:
	beqz	t8, .Lbegin_movement
	 LONG_SRL	t8, a1, 0x7

#ifdef USE_DOUBLE
	ld	t0, 0x00(src)
	ld	t1, 0x08(src)
	ADDC(sum, t0)
	ADDC(sum, t1)
#else
	CSUM_BIGCHUNK1(src, 0x00, sum, t0, t1, t3, t4)
#endif
	LONG_SUBU	a1, a1, 0x10
	PTR_ADDU	src, src, 0x10
	LONG_SRL	t8, a1, 0x7

.Lbegin_movement:
	beqz	t8, 1f
	 andi	t2, a1, 0x40

.Lmove_128bytes:
	CSUM_BIGCHUNK(src, 0x00, sum, t0, t1, t3, t4)
	CSUM_BIGCHUNK(src, 0x20, sum, t0, t1, t3, t4)
	CSUM_BIGCHUNK(src, 0x40, sum, t0, t1, t3, t4)
	CSUM_BIGCHUNK(src, 0x60, sum, t0, t1, t3, t4)
	LONG_SUBU	t8, t8, 0x01
	.set	reorder				/* DADDI_WAR */
	PTR_ADDU	src, src, 0x80
	bnez	t8, .Lmove_128bytes
	.set	noreorder

1:
	beqz	t2, 1f
	 andi	t2, a1, 0x20

.Lmove_64bytes:
	CSUM_BIGCHUNK(src, 0x00, sum, t0, t1, t3, t4)
	CSUM_BIGCHUNK(src, 0x20, sum, t0, t1, t3, t4)
	PTR_ADDU	src, src, 0x40

1:
	beqz	t2, .Ldo_end_words
	 andi	t8, a1, 0x1c

.Lmove_32bytes:
	CSUM_BIGCHUNK(src, 0x00, sum, t0, t1, t3, t4)
	andi	t8, a1, 0x1c
	PTR_ADDU	src, src, 0x20

.Ldo_end_words:
	beqz	t8, .Lsmall_csumcpy
	 andi	t2, a1, 0x3
	LONG_SRL	t8, t8, 0x2

.Lend_words:
	LOAD32	t0, (src)
	LONG_SUBU	t8, t8, 0x1
	ADDC(sum, t0)
	.set	reorder				/* DADDI_WAR */
	PTR_ADDU	src, src, 0x4
	bnez	t8, .Lend_words
	.set	noreorder

/* unknown src alignment and < 8 bytes to go  */
.Lsmall_csumcpy:
	move	a1, t2

	andi	t0, a1, 4
	beqz	t0, 1f
	 andi	t0, a1, 2

	/* Still a full word to go  */
	ulw	t1, (src)
	PTR_ADDIU	src, 4
#ifdef USE_DOUBLE
	dsll	t1, t1, 32			/* clear lower 32bit */
#endif
	ADDC(sum, t1)

1:	move	t1, zero
	beqz	t0, 1f
	 andi	t0, a1, 1

	/* Still a halfword to go  */
	ulhu	t1, (src)
	PTR_ADDIU	src, 2

1:	beqz	t0, 1f
	 sll	t1, t1, 16

	lbu	t2, (src)
	 nop

#ifdef __MIPSEB__
	sll	t2, t2, 8
#endif
	or	t1, t2

1:	ADDC(sum, t1)

	/* fold checksum */
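	/*
	 * On 64-bit builds the accumulator is still 64 bits wide: shift
	 * the low 32 bits into the upper half (dsll32), add them to the
	 * high half, keep the end-around carry, and shift the result
	 * back down (dsra32) so sum is a proper 32-bit ones' complement
	 * sum.
	 */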
#ifdef USE_DOUBLE
	dsll32	v1, sum, 0
	daddu	sum, v1
	sltu	v1, sum, v1
	dsra32	sum, sum, 0
	addu	sum, v1
#endif

	/* odd buffer alignment? */
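	/*
	 * If the buffer started on an odd address (t7 != 0) every byte
	 * of the sum sits in the wrong lane, so swap the bytes within
	 * each halfword: with wsbh/movn on MIPS r2/r5 and Loongson64,
	 * with shifts and masks otherwise.
	 */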
#if defined(CONFIG_CPU_MIPSR2) || defined(CONFIG_CPU_MIPSR5) || \
    defined(CONFIG_CPU_LOONGSON64)
	.set	push
	.set	arch=mips32r2
	wsbh	v1, sum
	movn	sum, v1, t7
	.set	pop
#else
	beqz	t7, 1f			/* odd buffer alignment? */
	 lui	v1, 0x00ff
	addu	v1, 0x00ff
	and	t0, sum, v1
	sll	t0, t0, 8
	srl	sum, sum, 8
	and	sum, sum, v1
	or	sum, sum, t0
1:
#endif
	.set	reorder
	/* Add the passed partial csum.	 */
	ADDC32(sum, a2)
	jr	ra
	.set	noreorder
	END(csum_partial)


/*
 * checksum and copy routines based on memcpy.S
 *
 *	csum_partial_copy_nocheck(src, dst, len)
 *	__csum_partial_copy_kernel(src, dst, len)
 *
 * See "Spec" in memcpy.S for details.	Unlike __copy_user, all
 * functions in this file use the standard calling convention.
 */

#define src a0
#define dst a1
#define len a2
#define sum v0
#define odd t8

/*
 * All exception handlers simply return 0.
 */
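/*
 * Faulting loads/stores are routed via the __ex_table entries emitted
 * by the EXC wrapper below to .L_exc at the end of this file, which
 * returns 0.
 */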

/* Instruction type */
#define LD_INSN 1
#define ST_INSN 2
#define LEGACY_MODE 1
#define EVA_MODE    2
#define USEROP   1
#define KERNELOP 2

/*
 * Wrapper to add an entry in the exception table
 * in case the insn causes a memory exception.
 * Arguments:
 * insn    : Load/store instruction
 * type    : Instruction type
 * reg     : Register
 * addr    : Address
 * handler : Exception handler
 */
#define EXC(insn, type, reg, addr)		\
	.if \mode == LEGACY_MODE;		\
9:		insn reg, addr;			\
		.section __ex_table,"a";	\
		PTR_WD	9b, .L_exc;		\
		.previous;			\
	/* This is enabled in EVA mode */	\
	.else;					\
		/* If loading from user or storing to user */	\
		.if ((\from == USEROP) && (type == LD_INSN)) || \
		    ((\to == USEROP) && (type == ST_INSN));	\
9:			__BUILD_EVA_INSN(insn##e, reg, addr);	\
			.section __ex_table,"a";		\
			PTR_WD	9b, .L_exc;			\
			.previous;				\
		.else;						\
			/* EVA without exception */		\
			insn reg, addr;				\
		.endif;						\
	.endif

#undef LOAD

#ifdef USE_DOUBLE

#define LOADK	ld /* No exception */
#define LOAD(reg, addr)		EXC(ld, LD_INSN, reg, addr)
#define LOADBU(reg, addr)	EXC(lbu, LD_INSN, reg, addr)
#define LOADL(reg, addr)	EXC(ldl, LD_INSN, reg, addr)
#define LOADR(reg, addr)	EXC(ldr, LD_INSN, reg, addr)
#define STOREB(reg, addr)	EXC(sb, ST_INSN, reg, addr)
#define STOREL(reg, addr)	EXC(sdl, ST_INSN, reg, addr)
#define STORER(reg, addr)	EXC(sdr, ST_INSN, reg, addr)
#define STORE(reg, addr)	EXC(sd, ST_INSN, reg, addr)
#define ADD    daddu
#define SUB    dsubu
#define SRL    dsrl
#define SLL    dsll
#define SLLV   dsllv
#define SRLV   dsrlv
#define NBYTES 8
#define LOG_NBYTES 3

#else

#define LOADK	lw /* No exception */
#define LOAD(reg, addr)		EXC(lw, LD_INSN, reg, addr)
#define LOADBU(reg, addr)	EXC(lbu, LD_INSN, reg, addr)
#define LOADL(reg, addr)	EXC(lwl, LD_INSN, reg, addr)
#define LOADR(reg, addr)	EXC(lwr, LD_INSN, reg, addr)
#define STOREB(reg, addr)	EXC(sb, ST_INSN, reg, addr)
#define STOREL(reg, addr)	EXC(swl, ST_INSN, reg, addr)
#define STORER(reg, addr)	EXC(swr, ST_INSN, reg, addr)
#define STORE(reg, addr)	EXC(sw, ST_INSN, reg, addr)
#define ADD    addu
#define SUB    subu
#define SRL    srl
#define SLL    sll
#define SLLV   sllv
#define SRLV   srlv
#define NBYTES 4
#define LOG_NBYTES 2

#endif /* USE_DOUBLE */

#ifdef CONFIG_CPU_LITTLE_ENDIAN
#define LDFIRST LOADR
#define LDREST	LOADL
#define STFIRST STORER
#define STREST	STOREL
#define SHIFT_DISCARD SLLV
#define SHIFT_DISCARD_REVERT SRLV
#else
#define LDFIRST LOADL
#define LDREST	LOADR
#define STFIRST STOREL
#define STREST	STORER
#define SHIFT_DISCARD SRLV
#define SHIFT_DISCARD_REVERT SLLV
#endif

#define FIRST(unit) ((unit)*NBYTES)
#define REST(unit)  (FIRST(unit)+NBYTES-1)

#define ADDRMASK (NBYTES-1)

#ifndef CONFIG_CPU_DADDI_WORKAROUNDS
	.set	noat
#else
	.set	at=v1
#endif

	.macro __BUILD_CSUM_PARTIAL_COPY_USER mode, from, to

	li	sum, -1
	move	odd, zero
	/*
	 * Note: dst & src may be unaligned, len may be 0
	 * Temps
	 */
	/*
	 * The "issue break"s below are very approximate.
	 * Issue delays for dcache fills will perturb the schedule, as will
	 * load queue full replay traps, etc.
	 *
	 * If len < NBYTES use byte operations.
	 */
	sltu	t2, len, NBYTES
	and	t1, dst, ADDRMASK
	bnez	t2, .Lcopy_bytes_checklen\@
	 and	t0, src, ADDRMASK
	andi	odd, dst, 0x1			/* odd buffer? */
	bnez	t1, .Ldst_unaligned\@
	 nop
	bnez	t0, .Lsrc_unaligned_dst_aligned\@
	/*
	 * use delay slot for fall-through
	 * src and dst are aligned; need to compute rem
	 */
.Lboth_aligned\@:
	 SRL	t0, len, LOG_NBYTES+3	 # +3 for 8 units/iter
	beqz	t0, .Lcleanup_both_aligned\@ # len < 8*NBYTES
	 nop
	SUB	len, 8*NBYTES		# subtract here for bgez loop
	.align	4
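	/*
	 * Main loop: copy and checksum 8*NBYTES bytes per iteration;
	 * the stores are interleaved with the pairwise ADDC additions.
	 */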
1:
	LOAD(t0, UNIT(0)(src))
	LOAD(t1, UNIT(1)(src))
	LOAD(t2, UNIT(2)(src))
	LOAD(t3, UNIT(3)(src))
	LOAD(t4, UNIT(4)(src))
	LOAD(t5, UNIT(5)(src))
	LOAD(t6, UNIT(6)(src))
	LOAD(t7, UNIT(7)(src))
	SUB	len, len, 8*NBYTES
	ADD	src, src, 8*NBYTES
	STORE(t0, UNIT(0)(dst))
	ADDC(t0, t1)
	STORE(t1, UNIT(1)(dst))
	ADDC(sum, t0)
	STORE(t2, UNIT(2)(dst))
	ADDC(t2, t3)
	STORE(t3, UNIT(3)(dst))
	ADDC(sum, t2)
	STORE(t4, UNIT(4)(dst))
	ADDC(t4, t5)
	STORE(t5, UNIT(5)(dst))
	ADDC(sum, t4)
	STORE(t6, UNIT(6)(dst))
	ADDC(t6, t7)
	STORE(t7, UNIT(7)(dst))
	ADDC(sum, t6)
	.set	reorder				/* DADDI_WAR */
	ADD	dst, dst, 8*NBYTES
	bgez	len, 1b
	.set	noreorder
	ADD	len, 8*NBYTES		# revert len (see above)

	/*
	 * len == the number of bytes left to copy < 8*NBYTES
	 */
.Lcleanup_both_aligned\@:
#define rem t7
	beqz	len, .Ldone\@
	 sltu	t0, len, 4*NBYTES
	bnez	t0, .Lless_than_4units\@
	 and	rem, len, (NBYTES-1)	# rem = len % NBYTES
	/*
	 * len >= 4*NBYTES
	 */
	LOAD(t0, UNIT(0)(src))
	LOAD(t1, UNIT(1)(src))
	LOAD(t2, UNIT(2)(src))
	LOAD(t3, UNIT(3)(src))
	SUB	len, len, 4*NBYTES
	ADD	src, src, 4*NBYTES
	STORE(t0, UNIT(0)(dst))
	ADDC(t0, t1)
	STORE(t1, UNIT(1)(dst))
	ADDC(sum, t0)
	STORE(t2, UNIT(2)(dst))
	ADDC(t2, t3)
	STORE(t3, UNIT(3)(dst))
	ADDC(sum, t2)
	.set	reorder				/* DADDI_WAR */
	ADD	dst, dst, 4*NBYTES
	beqz	len, .Ldone\@
	.set	noreorder
.Lless_than_4units\@:
	/*
	 * rem = len % NBYTES
	 */
	beq	rem, len, .Lcopy_bytes\@
	 nop
1:
	LOAD(t0, 0(src))
	ADD	src, src, NBYTES
	SUB	len, len, NBYTES
	STORE(t0, 0(dst))
	ADDC(sum, t0)
	.set	reorder				/* DADDI_WAR */
	ADD	dst, dst, NBYTES
	bne	rem, len, 1b
	.set	noreorder

	/*
	 * src and dst are aligned, need to copy rem bytes (rem < NBYTES)
	 * A loop would do only a byte at a time with possible branch
	 * mispredicts.	 Can't do an explicit LOAD dst,mask,or,STORE
	 * because can't assume read-access to dst.  Instead, use
	 * STREST dst, which doesn't require read access to dst.
	 *
	 * This code should perform better than a simple loop on modern,
	 * wide-issue mips processors because the code has fewer branches and
	 * more instruction-level parallelism.
	 */
#define bits t2
	beqz	len, .Ldone\@
	 ADD	t1, dst, len	# t1 is just past last byte of dst
	li	bits, 8*NBYTES
	SLL	rem, len, 3	# rem = number of bits to keep
	LOAD(t0, 0(src))
	SUB	bits, bits, rem # bits = number of bits to discard
	SHIFT_DISCARD t0, t0, bits
	STREST(t0, -1(t1))
	SHIFT_DISCARD_REVERT t0, t0, bits
	.set reorder
	ADDC(sum, t0)
	b	.Ldone\@
	.set noreorder
.Ldst_unaligned\@:
	/*
	 * dst is unaligned
	 * t0 = src & ADDRMASK
	 * t1 = dst & ADDRMASK; t1 > 0
	 * len >= NBYTES
	 *
	 * Copy enough bytes to align dst
	 * Set match = (src and dst have same alignment)
	 */
#define match rem
	LDFIRST(t3, FIRST(0)(src))
	ADD	t2, zero, NBYTES
	LDREST(t3, REST(0)(src))
	SUB	t2, t2, t1	# t2 = number of bytes copied
	xor	match, t0, t1
	STFIRST(t3, FIRST(0)(dst))
	SLL	t4, t1, 3		# t4 = number of bits to discard
	SHIFT_DISCARD t3, t3, t4
	/* no SHIFT_DISCARD_REVERT to handle odd buffer properly */
	ADDC(sum, t3)
	beq	len, t2, .Ldone\@
	 SUB	len, len, t2
	ADD	dst, dst, t2
	beqz	match, .Lboth_aligned\@
	 ADD	src, src, t2

.Lsrc_unaligned_dst_aligned\@:
	SRL	t0, len, LOG_NBYTES+2	 # +2 for 4 units/iter
	beqz	t0, .Lcleanup_src_unaligned\@
	 and	rem, len, (4*NBYTES-1)	 # rem = len % 4*NBYTES
1:
/*
 * Avoid consecutive LD*'s to the same register since some mips
 * implementations can't issue them in the same cycle.
 * It's OK to load FIRST(N+1) before REST(N) because the two addresses
 * are to the same unit (unless src is aligned, but it's not).
 */
	LDFIRST(t0, FIRST(0)(src))
	LDFIRST(t1, FIRST(1)(src))
	SUB	len, len, 4*NBYTES
	LDREST(t0, REST(0)(src))
	LDREST(t1, REST(1)(src))
	LDFIRST(t2, FIRST(2)(src))
	LDFIRST(t3, FIRST(3)(src))
	LDREST(t2, REST(2)(src))
	LDREST(t3, REST(3)(src))
	ADD	src, src, 4*NBYTES
#ifdef CONFIG_CPU_SB1
	nop				# improves slotting
#endif
	STORE(t0, UNIT(0)(dst))
	ADDC(t0, t1)
	STORE(t1, UNIT(1)(dst))
	ADDC(sum, t0)
	STORE(t2, UNIT(2)(dst))
	ADDC(t2, t3)
	STORE(t3, UNIT(3)(dst))
	ADDC(sum, t2)
	.set	reorder				/* DADDI_WAR */
	ADD	dst, dst, 4*NBYTES
	bne	len, rem, 1b
	.set	noreorder

.Lcleanup_src_unaligned\@:
	beqz	len, .Ldone\@
	 and	rem, len, NBYTES-1  # rem = len % NBYTES
	beq	rem, len, .Lcopy_bytes\@
	 nop
1:
	LDFIRST(t0, FIRST(0)(src))
	LDREST(t0, REST(0)(src))
	ADD	src, src, NBYTES
	SUB	len, len, NBYTES
	STORE(t0, 0(dst))
	ADDC(sum, t0)
	.set	reorder				/* DADDI_WAR */
	ADD	dst, dst, NBYTES
	bne	len, rem, 1b
	.set	noreorder

.Lcopy_bytes_checklen\@:
	beqz	len, .Ldone\@
	 nop
.Lcopy_bytes\@:
	/* 0 < len < NBYTES  */
#ifdef CONFIG_CPU_LITTLE_ENDIAN
#define SHIFT_START 0
#define SHIFT_INC 8
#else
#define SHIFT_START 8*(NBYTES-1)
#define SHIFT_INC -8
#endif
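	/*
	 * Copy the remaining 0 < len < NBYTES bytes one at a time,
	 * accumulating each byte into t2 at its proper position for this
	 * endianness (t3 starts at SHIFT_START and steps by SHIFT_INC).
	 */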
	move	t2, zero	# partial word
	li	t3, SHIFT_START # shift
#define COPY_BYTE(N)			\
	LOADBU(t0, N(src));		\
	SUB	len, len, 1;		\
	STOREB(t0, N(dst));		\
	SLLV	t0, t0, t3;		\
	addu	t3, SHIFT_INC;		\
	beqz	len, .Lcopy_bytes_done\@; \
	 or	t2, t0

	COPY_BYTE(0)
	COPY_BYTE(1)
#ifdef USE_DOUBLE
	COPY_BYTE(2)
	COPY_BYTE(3)
	COPY_BYTE(4)
	COPY_BYTE(5)
#endif
	LOADBU(t0, NBYTES-2(src))
	SUB	len, len, 1
	STOREB(t0, NBYTES-2(dst))
	SLLV	t0, t0, t3
	or	t2, t0
.Lcopy_bytes_done\@:
	ADDC(sum, t2)
.Ldone\@:
	/* fold checksum */
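	/*
	 * Same 64-to-32-bit fold and odd-alignment byte swap as at the
	 * end of csum_partial above, with the odd flag held in t8.
	 */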
	.set	push
	.set	noat
#ifdef USE_DOUBLE
	dsll32	v1, sum, 0
	daddu	sum, v1
	sltu	v1, sum, v1
	dsra32	sum, sum, 0
	addu	sum, v1
#endif

#if defined(CONFIG_CPU_MIPSR2) || defined(CONFIG_CPU_MIPSR5) || \
    defined(CONFIG_CPU_LOONGSON64)
	.set	push
	.set	arch=mips32r2
	wsbh	v1, sum
	movn	sum, v1, odd
	.set	pop
#else
	beqz	odd, 1f			/* odd buffer alignment? */
	 lui	v1, 0x00ff
	addu	v1, 0x00ff
	and	t0, sum, v1
	sll	t0, t0, 8
	srl	sum, sum, 8
	and	sum, sum, v1
	or	sum, sum, t0
1:
#endif
	.set	pop
	.set reorder
	jr	ra
	.set noreorder
	.endm

	.set noreorder
.L_exc:
	jr	ra
	 li	v0, 0

FEXPORT(__csum_partial_copy_nocheck)
EXPORT_SYMBOL(__csum_partial_copy_nocheck)
#ifndef CONFIG_EVA
FEXPORT(__csum_partial_copy_to_user)
EXPORT_SYMBOL(__csum_partial_copy_to_user)
FEXPORT(__csum_partial_copy_from_user)
EXPORT_SYMBOL(__csum_partial_copy_from_user)
#endif
__BUILD_CSUM_PARTIAL_COPY_USER LEGACY_MODE USEROP USEROP

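/*
 * Without EVA the user-copy entry points above simply alias the
 * LEGACY_MODE body.  With EVA, separate bodies are built so that the
 * user-side accesses use the EVA load/store variants: the destination
 * for copy_to_user and the source for copy_from_user.
 */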
#ifdef CONFIG_EVA
LEAF(__csum_partial_copy_to_user)
__BUILD_CSUM_PARTIAL_COPY_USER EVA_MODE KERNELOP USEROP
END(__csum_partial_copy_to_user)

LEAF(__csum_partial_copy_from_user)
__BUILD_CSUM_PARTIAL_COPY_USER EVA_MODE USEROP KERNELOP
END(__csum_partial_copy_from_user)
#endif