cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

NGmemcpy.S (13373B)


/* SPDX-License-Identifier: GPL-2.0 */
/* NGmemcpy.S: Niagara optimized memcpy.
 *
 * Copyright (C) 2006, 2007 David S. Miller (davem@davemloft.net)
 */

#ifdef __KERNEL__
#include <linux/linkage.h>
#include <asm/asi.h>
#include <asm/thread_info.h>
#define GLOBAL_SPARE	%g7
#define RESTORE_ASI(TMP)	\
	wr	%g0, ASI_AIUS, %asi
#else
#define GLOBAL_SPARE	%g5
#define RESTORE_ASI(TMP)	\
	wr	%g0, ASI_PNF, %asi
#endif

#ifdef __sparc_v9__
#define SAVE_AMOUNT	128
#else
#define SAVE_AMOUNT	64
#endif

#ifndef STORE_ASI
#define STORE_ASI	ASI_BLK_INIT_QUAD_LDD_P
#endif

#ifndef EX_LD
#define EX_LD(x,y)	x
#endif

#ifndef EX_ST
#define EX_ST(x,y)	x
#endif

#ifndef LOAD
#ifndef MEMCPY_DEBUG
#define LOAD(type,addr,dest)	type [addr], dest
#else
#define LOAD(type,addr,dest)	type##a [addr] 0x80, dest
#endif
#endif

#ifndef LOAD_TWIN
#define LOAD_TWIN(addr_reg,dest0,dest1)	\
	ldda [addr_reg] ASI_BLK_INIT_QUAD_LDD_P, dest0
#endif

#ifndef STORE
#define STORE(type,src,addr)	type src, [addr]
#endif

#ifndef STORE_INIT
#ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA
#define STORE_INIT(src,addr)	stxa src, [addr] %asi
#else
#define STORE_INIT(src,addr)	stx src, [addr + 0x00]
#endif
#endif
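
/* Block-init stores on Niagara allocate the L2 cache line without
 * fetching its old contents from memory; the block copy loops below
 * must therefore write each such line in full (see the comment ahead
 * of those loops).
 */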

#ifndef FUNC_NAME
#define FUNC_NAME	NGmemcpy
#endif
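
/* NGcopy_from_user.S and NGcopy_to_user.S #include this file and
 * override macros such as FUNC_NAME, LOAD/STORE and the EX_LD/EX_ST
 * wrappers above to build the user-space copy variants from the
 * same body.
 */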

#ifndef PREAMBLE
#define PREAMBLE
#endif

#ifndef XCC
#define XCC xcc
#endif

	.register	%g2,#scratch
	.register	%g3,#scratch

	.text
#ifndef EX_RETVAL
#define EX_RETVAL(x)	x
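/* Exception return stubs.  If a wrapped load or store faults, the
 * exception handler continues at one of these NG_ret_* entries; each
 * computes into %i0 the number of bytes that were NOT copied (the
 * value the copy_{from,to}_user callers must return) and then
 * restores the ASI before returning.
 */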
__restore_asi:
	ret
	wr	%g0, ASI_AIUS, %asi
	 restore
ENTRY(NG_ret_i2_plus_i4_plus_1)
	ba,pt	%xcc, __restore_asi
	 add	%i2, %i5, %i0
ENDPROC(NG_ret_i2_plus_i4_plus_1)
ENTRY(NG_ret_i2_plus_g1)
	ba,pt	%xcc, __restore_asi
	 add	%i2, %g1, %i0
ENDPROC(NG_ret_i2_plus_g1)
ENTRY(NG_ret_i2_plus_g1_minus_8)
	sub	%g1, 8, %g1
	ba,pt	%xcc, __restore_asi
	 add	%i2, %g1, %i0
ENDPROC(NG_ret_i2_plus_g1_minus_8)
ENTRY(NG_ret_i2_plus_g1_minus_16)
	sub	%g1, 16, %g1
	ba,pt	%xcc, __restore_asi
	 add	%i2, %g1, %i0
ENDPROC(NG_ret_i2_plus_g1_minus_16)
ENTRY(NG_ret_i2_plus_g1_minus_24)
	sub	%g1, 24, %g1
	ba,pt	%xcc, __restore_asi
	 add	%i2, %g1, %i0
ENDPROC(NG_ret_i2_plus_g1_minus_24)
ENTRY(NG_ret_i2_plus_g1_minus_32)
	sub	%g1, 32, %g1
	ba,pt	%xcc, __restore_asi
	 add	%i2, %g1, %i0
ENDPROC(NG_ret_i2_plus_g1_minus_32)
ENTRY(NG_ret_i2_plus_g1_minus_40)
	sub	%g1, 40, %g1
	ba,pt	%xcc, __restore_asi
	 add	%i2, %g1, %i0
ENDPROC(NG_ret_i2_plus_g1_minus_40)
ENTRY(NG_ret_i2_plus_g1_minus_48)
	sub	%g1, 48, %g1
	ba,pt	%xcc, __restore_asi
	 add	%i2, %g1, %i0
ENDPROC(NG_ret_i2_plus_g1_minus_48)
ENTRY(NG_ret_i2_plus_g1_minus_56)
	sub	%g1, 56, %g1
	ba,pt	%xcc, __restore_asi
	 add	%i2, %g1, %i0
ENDPROC(NG_ret_i2_plus_g1_minus_56)
ENTRY(NG_ret_i2_plus_i4)
	ba,pt	%xcc, __restore_asi
	 add	%i2, %i4, %i0
ENDPROC(NG_ret_i2_plus_i4)
ENTRY(NG_ret_i2_plus_i4_minus_8)
	sub	%i4, 8, %i4
	ba,pt	%xcc, __restore_asi
	 add	%i2, %i4, %i0
ENDPROC(NG_ret_i2_plus_i4_minus_8)
ENTRY(NG_ret_i2_plus_8)
	ba,pt	%xcc, __restore_asi
	 add	%i2, 8, %i0
ENDPROC(NG_ret_i2_plus_8)
ENTRY(NG_ret_i2_plus_4)
	ba,pt	%xcc, __restore_asi
	 add	%i2, 4, %i0
ENDPROC(NG_ret_i2_plus_4)
ENTRY(NG_ret_i2_plus_1)
	ba,pt	%xcc, __restore_asi
	 add	%i2, 1, %i0
ENDPROC(NG_ret_i2_plus_1)
ENTRY(NG_ret_i2_plus_g1_plus_1)
	add	%g1, 1, %g1
	ba,pt	%xcc, __restore_asi
	 add	%i2, %g1, %i0
ENDPROC(NG_ret_i2_plus_g1_plus_1)
ENTRY(NG_ret_i2)
	ba,pt	%xcc, __restore_asi
	 mov	%i2, %i0
ENDPROC(NG_ret_i2)
ENTRY(NG_ret_i2_and_7_plus_i4)
	and	%i2, 7, %i2
	ba,pt	%xcc, __restore_asi
	 add	%i2, %i4, %i0
ENDPROC(NG_ret_i2_and_7_plus_i4)
#endif
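
/* Example of the encoding above: in the block copy loops %i2 holds
 * the bytes left for the tail copy and %g1 the bytes still owed by
 * the block phase, so a fault on the init store at offset 0x10 of a
 * 64-byte chunk uses NG_ret_i2_plus_g1_minus_16: the chunk's first
 * 16 bytes already reached the destination and are subtracted from
 * the remaining count.
 */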

	.align		64

	.globl	FUNC_NAME
	.type	FUNC_NAME,#function
FUNC_NAME:	/* %i0=dst, %i1=src, %i2=len */
	PREAMBLE
	save		%sp, -SAVE_AMOUNT, %sp
	srlx		%i2, 31, %g2
	cmp		%g2, 0
	tne		%xcc, 5
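	/* The srlx/tne pair above raises a software trap if any bit at
	 * or above bit 31 of the length is set, i.e. if a bogus
	 * (negative) length was passed in, instead of attempting the
	 * copy.
	 */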
	mov		%i0, %o0
	cmp		%i2, 0
	be,pn		%XCC, 85f
	 or		%o0, %i1, %i3
	cmp		%i2, 16
	blu,a,pn	%XCC, 80f
	 or		%i3, %i2, %i3

	/* 2 blocks (128 bytes) is the minimum we can do the block
	 * copy with.  We need to ensure that we'll iterate at least
	 * once in the block copy loop.  At worst we'll need to align
	 * the destination to a 64-byte boundary which can chew up
	 * to (64 - 1) bytes from the length before we perform the
	 * block copy loop.
	 */
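	/* Worked numbers: with len == 128 and worst-case destination
	 * alignment, (64 - 1) == 63 bytes go to the byte-align loop
	 * and 128 - 63 == 65 bytes remain, still enough for one
	 * 64-byte iteration of the block copy loop.
	 */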
	cmp		%i2, (2 * 64)
	blu,pt		%XCC, 70f
	 andcc		%i3, 0x7, %g0

	/* %o0:	dst
	 * %i1:	src
	 * %i2:	len  (known to be >= 128)
	 *
	 * The block copy loops will use %i4/%i5,%g2/%g3 as
	 * temporaries while copying the data.
	 */

	LOAD(prefetch, %i1, #one_read)
	wr		%g0, STORE_ASI, %asi

	/* Align destination on 64-byte boundary.  */
	andcc		%o0, (64 - 1), %i4
	be,pt		%XCC, 2f
	 sub		%i4, 64, %i4
	sub		%g0, %i4, %i4	! bytes to align dst
	sub		%i2, %i4, %i2
1:	subcc		%i4, 1, %i4
	EX_LD(LOAD(ldub, %i1, %g1), NG_ret_i2_plus_i4_plus_1)
	EX_ST(STORE(stb, %g1, %o0), NG_ret_i2_plus_i4_plus_1)
	add		%i1, 1, %i1
	bne,pt		%XCC, 1b
	add		%o0, 1, %o0

	/* If the source is on a 16-byte boundary we can do
	 * the direct block copy loop.  If it is 8-byte aligned
	 * we can do the 16-byte loads offset by -8 bytes and the
	 * init stores offset by one register.
	 *
	 * If the source is not even 8-byte aligned, we need to do
	 * shifting and masking (basically integer faligndata).
	 *
	 * The careful bit with init stores is that if we store
	 * to any part of the cache line we have to store the whole
	 * cacheline else we can end up with corrupt L2 cache line
	 * contents.  Since the loop works on 64-bytes of 64-byte
	 * aligned store data at a time, this is easy to ensure.
	 */
2:
	andcc		%i1, (16 - 1), %i4
	andn		%i2, (64 - 1), %g1	! block copy loop iterator
	be,pt		%XCC, 50f
	 sub		%i2, %g1, %i2		! final sub-block copy bytes

	cmp		%i4, 8
	be,pt		%XCC, 10f
	 sub		%i1, %i4, %i1

	/* Neither 8-byte nor 16-byte aligned, shift and mask.  */
	and		%i4, 0x7, GLOBAL_SPARE
	sll		GLOBAL_SPARE, 3, GLOBAL_SPARE
	mov		64, %i5
	EX_LD(LOAD_TWIN(%i1, %g2, %g3), NG_ret_i2_plus_g1)
	sub		%i5, GLOBAL_SPARE, %i5
	mov		16, %o4
	mov		32, %o5
	mov		48, %o7
	mov		64, %i3

	bg,pn	   	%XCC, 9f
	 nop

#define MIX_THREE_WORDS(WORD1, WORD2, WORD3, PRE_SHIFT, POST_SHIFT, TMP) \
	sllx		WORD1, POST_SHIFT, WORD1; \
	srlx		WORD2, PRE_SHIFT, TMP; \
	sllx		WORD2, POST_SHIFT, WORD2; \
	or		WORD1, TMP, WORD1; \
	srlx		WORD3, PRE_SHIFT, TMP; \
	or		WORD2, TMP, WORD2;

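/* Illustrative C model (not part of the build) of one round of
 * MIX_THREE_WORDS.  Here `a' is the source's byte misalignment
 * within a dword (1..7), so POST_SHIFT (GLOBAL_SPARE) is 8*a and
 * PRE_SHIFT (%i5) is 64 - 8*a; on this big-endian CPU each output
 * dword splices the tail of one aligned dword onto the head of the
 * next:
 *
 *	static unsigned long mix(unsigned long hi, unsigned long lo,
 *				 unsigned int post, unsigned int pre)
 *	{
 *		return (hi << post) | (lo >> pre);
 *	}
 *
 * WORD1 becomes mix(WORD1, WORD2), WORD2 becomes mix(WORD2, WORD3),
 * and WORD3 is carried into the next round.
 */
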
8:	EX_LD(LOAD_TWIN(%i1 + %o4, %o2, %o3), NG_ret_i2_plus_g1)
	MIX_THREE_WORDS(%g2, %g3, %o2, %i5, GLOBAL_SPARE, %o1)
	LOAD(prefetch, %i1 + %i3, #one_read)

	EX_ST(STORE_INIT(%g2, %o0 + 0x00), NG_ret_i2_plus_g1)
	EX_ST(STORE_INIT(%g3, %o0 + 0x08), NG_ret_i2_plus_g1_minus_8)

	EX_LD(LOAD_TWIN(%i1 + %o5, %g2, %g3), NG_ret_i2_plus_g1_minus_16)
	MIX_THREE_WORDS(%o2, %o3, %g2, %i5, GLOBAL_SPARE, %o1)

	EX_ST(STORE_INIT(%o2, %o0 + 0x10), NG_ret_i2_plus_g1_minus_16)
	EX_ST(STORE_INIT(%o3, %o0 + 0x18), NG_ret_i2_plus_g1_minus_24)

	EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3), NG_ret_i2_plus_g1_minus_32)
	MIX_THREE_WORDS(%g2, %g3, %o2, %i5, GLOBAL_SPARE, %o1)

	EX_ST(STORE_INIT(%g2, %o0 + 0x20), NG_ret_i2_plus_g1_minus_32)
	EX_ST(STORE_INIT(%g3, %o0 + 0x28), NG_ret_i2_plus_g1_minus_40)

	EX_LD(LOAD_TWIN(%i1 + %i3, %g2, %g3), NG_ret_i2_plus_g1_minus_48)
	add		%i1, 64, %i1
	MIX_THREE_WORDS(%o2, %o3, %g2, %i5, GLOBAL_SPARE, %o1)

	EX_ST(STORE_INIT(%o2, %o0 + 0x30), NG_ret_i2_plus_g1_minus_48)
	EX_ST(STORE_INIT(%o3, %o0 + 0x38), NG_ret_i2_plus_g1_minus_56)

	subcc		%g1, 64, %g1
	bne,pt		%XCC, 8b
	 add		%o0, 64, %o0

	ba,pt		%XCC, 60f
	 add		%i1, %i4, %i1

9:	EX_LD(LOAD_TWIN(%i1 + %o4, %o2, %o3), NG_ret_i2_plus_g1)
	MIX_THREE_WORDS(%g3, %o2, %o3, %i5, GLOBAL_SPARE, %o1)
	LOAD(prefetch, %i1 + %i3, #one_read)

	EX_ST(STORE_INIT(%g3, %o0 + 0x00), NG_ret_i2_plus_g1)
	EX_ST(STORE_INIT(%o2, %o0 + 0x08), NG_ret_i2_plus_g1_minus_8)

	EX_LD(LOAD_TWIN(%i1 + %o5, %g2, %g3), NG_ret_i2_plus_g1_minus_16)
	MIX_THREE_WORDS(%o3, %g2, %g3, %i5, GLOBAL_SPARE, %o1)

	EX_ST(STORE_INIT(%o3, %o0 + 0x10), NG_ret_i2_plus_g1_minus_16)
	EX_ST(STORE_INIT(%g2, %o0 + 0x18), NG_ret_i2_plus_g1_minus_24)

	EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3), NG_ret_i2_plus_g1_minus_32)
	MIX_THREE_WORDS(%g3, %o2, %o3, %i5, GLOBAL_SPARE, %o1)

	EX_ST(STORE_INIT(%g3, %o0 + 0x20), NG_ret_i2_plus_g1_minus_32)
	EX_ST(STORE_INIT(%o2, %o0 + 0x28), NG_ret_i2_plus_g1_minus_40)

	EX_LD(LOAD_TWIN(%i1 + %i3, %g2, %g3), NG_ret_i2_plus_g1_minus_48)
	add		%i1, 64, %i1
	MIX_THREE_WORDS(%o3, %g2, %g3, %i5, GLOBAL_SPARE, %o1)

	EX_ST(STORE_INIT(%o3, %o0 + 0x30), NG_ret_i2_plus_g1_minus_48)
	EX_ST(STORE_INIT(%g2, %o0 + 0x38), NG_ret_i2_plus_g1_minus_56)

	subcc		%g1, 64, %g1
	bne,pt		%XCC, 9b
	 add		%o0, 64, %o0

	ba,pt		%XCC, 60f
	 add		%i1, %i4, %i1

10:	/* Destination is 64-byte aligned, source was only 8-byte
	 * aligned but it has been subtracted by 8 and we perform
	 * one twin load ahead, then add 8 back into source when
	 * we finish the loop.
	 */
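	/* The loads below run one LOAD_TWIN ahead of the stores: the
	 * %o4/%o5 and %o2/%o3 register pairs rotate so that every init
	 * store writes a dword fetched by an earlier twin load, and
	 * the -8 bias keeps each 16-byte twin load lined up with the
	 * store offsets.
	 */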
	EX_LD(LOAD_TWIN(%i1, %o4, %o5), NG_ret_i2_plus_g1)
	mov	16, %o7
	mov	32, %g2
	mov	48, %g3
	mov	64, %o1
1:	EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3), NG_ret_i2_plus_g1)
	LOAD(prefetch, %i1 + %o1, #one_read)
	EX_ST(STORE_INIT(%o5, %o0 + 0x00), NG_ret_i2_plus_g1)	! initializes cache line
	EX_ST(STORE_INIT(%o2, %o0 + 0x08), NG_ret_i2_plus_g1_minus_8)
	EX_LD(LOAD_TWIN(%i1 + %g2, %o4, %o5), NG_ret_i2_plus_g1_minus_16)
	EX_ST(STORE_INIT(%o3, %o0 + 0x10), NG_ret_i2_plus_g1_minus_16)
	EX_ST(STORE_INIT(%o4, %o0 + 0x18), NG_ret_i2_plus_g1_minus_24)
	EX_LD(LOAD_TWIN(%i1 + %g3, %o2, %o3), NG_ret_i2_plus_g1_minus_32)
	EX_ST(STORE_INIT(%o5, %o0 + 0x20), NG_ret_i2_plus_g1_minus_32)
	EX_ST(STORE_INIT(%o2, %o0 + 0x28), NG_ret_i2_plus_g1_minus_40)
	EX_LD(LOAD_TWIN(%i1 + %o1, %o4, %o5), NG_ret_i2_plus_g1_minus_48)
	add		%i1, 64, %i1
	EX_ST(STORE_INIT(%o3, %o0 + 0x30), NG_ret_i2_plus_g1_minus_48)
	EX_ST(STORE_INIT(%o4, %o0 + 0x38), NG_ret_i2_plus_g1_minus_56)
	subcc		%g1, 64, %g1
	bne,pt		%XCC, 1b
	 add		%o0, 64, %o0

	ba,pt		%XCC, 60f
	 add		%i1, 0x8, %i1

50:	/* Destination is 64-byte aligned, and source is 16-byte
	 * aligned.
	 */
	mov	16, %o7
	mov	32, %g2
	mov	48, %g3
	mov	64, %o1
1:	EX_LD(LOAD_TWIN(%i1 + %g0, %o4, %o5), NG_ret_i2_plus_g1)
	EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3), NG_ret_i2_plus_g1)
	LOAD(prefetch, %i1 + %o1, #one_read)
	EX_ST(STORE_INIT(%o4, %o0 + 0x00), NG_ret_i2_plus_g1)	! initializes cache line
	EX_ST(STORE_INIT(%o5, %o0 + 0x08), NG_ret_i2_plus_g1_minus_8)
	EX_LD(LOAD_TWIN(%i1 + %g2, %o4, %o5), NG_ret_i2_plus_g1_minus_16)
	EX_ST(STORE_INIT(%o2, %o0 + 0x10), NG_ret_i2_plus_g1_minus_16)
	EX_ST(STORE_INIT(%o3, %o0 + 0x18), NG_ret_i2_plus_g1_minus_24)
	EX_LD(LOAD_TWIN(%i1 + %g3, %o2, %o3), NG_ret_i2_plus_g1_minus_32)
	add	%i1, 64, %i1
	EX_ST(STORE_INIT(%o4, %o0 + 0x20), NG_ret_i2_plus_g1_minus_32)
	EX_ST(STORE_INIT(%o5, %o0 + 0x28), NG_ret_i2_plus_g1_minus_40)
	EX_ST(STORE_INIT(%o2, %o0 + 0x30), NG_ret_i2_plus_g1_minus_48)
	EX_ST(STORE_INIT(%o3, %o0 + 0x38), NG_ret_i2_plus_g1_minus_56)
	subcc	%g1, 64, %g1
	bne,pt	%XCC, 1b
	 add	%o0, 64, %o0
	/* fall through */

60:
	membar		#Sync

	/* %i2 contains any final bytes still needed to be copied
	 * over. If anything is left, we copy it one byte at a time.
	 */
	RESTORE_ASI(%i3)
	brz,pt		%i2, 85f
	 sub		%o0, %i1, %i3
	ba,a,pt		%XCC, 90f
	 nop

	.align		64
70: /* 16 <= len < 128 */
	bne,pn		%XCC, 75f
	 sub		%o0, %i1, %i3

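	/* 16 bytes per iteration: %i3 holds dst - src, so loads use
	 * %i1 directly while the matching stores go through %i1 + %i3,
	 * and only %i1 has to be stepped.
	 */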
72:
	andn		%i2, 0xf, %i4
	and		%i2, 0xf, %i2
1:	subcc		%i4, 0x10, %i4
	EX_LD(LOAD(ldx, %i1, %o4), NG_ret_i2_plus_i4)
	add		%i1, 0x08, %i1
	EX_LD(LOAD(ldx, %i1, %g1), NG_ret_i2_plus_i4)
	sub		%i1, 0x08, %i1
	EX_ST(STORE(stx, %o4, %i1 + %i3), NG_ret_i2_plus_i4)
	add		%i1, 0x8, %i1
	EX_ST(STORE(stx, %g1, %i1 + %i3), NG_ret_i2_plus_i4_minus_8)
	bgu,pt		%XCC, 1b
	 add		%i1, 0x8, %i1
73:	andcc		%i2, 0x8, %g0
	be,pt		%XCC, 1f
	 nop
	sub		%i2, 0x8, %i2
	EX_LD(LOAD(ldx, %i1, %o4), NG_ret_i2_plus_8)
	EX_ST(STORE(stx, %o4, %i1 + %i3), NG_ret_i2_plus_8)
	add		%i1, 0x8, %i1
1:	andcc		%i2, 0x4, %g0
	be,pt		%XCC, 1f
	 nop
	sub		%i2, 0x4, %i2
	EX_LD(LOAD(lduw, %i1, %i5), NG_ret_i2_plus_4)
	EX_ST(STORE(stw, %i5, %i1 + %i3), NG_ret_i2_plus_4)
	add		%i1, 0x4, %i1
1:	cmp		%i2, 0
	be,pt		%XCC, 85f
	 nop
	ba,pt		%xcc, 90f
	 nop

75:
	andcc		%o0, 0x7, %g1
	sub		%g1, 0x8, %g1
	be,pn		%icc, 2f
	 sub		%g0, %g1, %g1
	sub		%i2, %g1, %i2

1:	subcc		%g1, 1, %g1
	EX_LD(LOAD(ldub, %i1, %i5), NG_ret_i2_plus_g1_plus_1)
	EX_ST(STORE(stb, %i5, %i1 + %i3), NG_ret_i2_plus_g1_plus_1)
	bgu,pt		%icc, 1b
	 add		%i1, 1, %i1

2:	add		%i1, %i3, %o0
	andcc		%i1, 0x7, %g1
	bne,pt		%icc, 8f
	 sll		%g1, 3, %g1

	cmp		%i2, 16
	bgeu,pt		%icc, 72b
	 nop
	ba,a,pt		%xcc, 73b

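	/* Source and destination disagree on dword alignment, so merge
	 * pairs of aligned source dwords with shifts, one output dword
	 * at a time: %g1 holds the misalignment in bits and %i3 will
	 * hold 64 - %g1, as in the block copy's shift-and-mask path.
	 */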
8:	mov		64, %i3
	andn		%i1, 0x7, %i1
	EX_LD(LOAD(ldx, %i1, %g2), NG_ret_i2)
	sub		%i3, %g1, %i3
	andn		%i2, 0x7, %i4
	sllx		%g2, %g1, %g2
1:	add		%i1, 0x8, %i1
	EX_LD(LOAD(ldx, %i1, %g3), NG_ret_i2_and_7_plus_i4)
	subcc		%i4, 0x8, %i4
	srlx		%g3, %i3, %i5
	or		%i5, %g2, %i5
	EX_ST(STORE(stx, %i5, %o0), NG_ret_i2_and_7_plus_i4)
	add		%o0, 0x8, %o0
	bgu,pt		%icc, 1b
	 sllx		%g3, %g1, %g2

	srl		%g1, 3, %g1
	andcc		%i2, 0x7, %i2
	be,pn		%icc, 85f
	 add		%i1, %g1, %i1
	ba,pt		%xcc, 90f
	 sub		%o0, %i1, %i3

	.align		64
80: /* 0 < len < 16 */
	andcc		%i3, 0x3, %g0
	bne,pn		%XCC, 90f
	 sub		%o0, %i1, %i3

1:
	subcc		%i2, 4, %i2
	EX_LD(LOAD(lduw, %i1, %g1), NG_ret_i2_plus_4)
	EX_ST(STORE(stw, %g1, %i1 + %i3), NG_ret_i2_plus_4)
	bgu,pt		%XCC, 1b
	 add		%i1, 4, %i1

85:	ret
	 restore	EX_RETVAL(%i0), %g0, %o0

	.align		32
90:
	subcc		%i2, 1, %i2
	EX_LD(LOAD(ldub, %i1, %g1), NG_ret_i2_plus_1)
	EX_ST(STORE(stb, %g1, %i1 + %i3), NG_ret_i2_plus_1)
	bgu,pt		%XCC, 90b
	 add		%i1, 1, %i1
	ret
	 restore	EX_RETVAL(%i0), %g0, %o0

	.size		FUNC_NAME, .-FUNC_NAME