cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

M7memcpy.S (31360B)


/*
 * M7memcpy: Optimized SPARC M7 memcpy
 *
 * Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved.
 */

	.file	"M7memcpy.S"

/*
 * memcpy(s1, s2, len)
 *
 * Copy s2 to s1, always copy n bytes.
 * Note: this C code does not work for overlapped copies.
 *
 * Fast assembler language version of the following C-program for memcpy
 * which represents the `standard' for the C-library.
 *
 *	void *
 *	memcpy(void *s, const void *s0, size_t n)
 *	{
 *		if (n != 0) {
 *		    char *s1 = s;
 *		    const char *s2 = s0;
 *		    do {
 *			*s1++ = *s2++;
 *		    } while (--n != 0);
 *		}
 *		return (s);
 *	}
 *
 *
 * SPARC T7/M7 Flow :
 *
 * if (count < SMALL_MAX) {
 *   if count < SHORTCOPY              (SHORTCOPY=3)
 *	copy bytes; exit with dst addr
 *   if src & dst aligned on word boundary but not long word boundary,
 *     copy with ldw/stw; branch to finish_up
 *   if src & dst aligned on long word boundary
 *     copy with ldx/stx; branch to finish_up
 *   if src & dst not aligned and length <= SHORTCHECK   (SHORTCHECK=14)
 *     copy bytes; exit with dst addr
 *   move enough bytes to get src to word boundary
 *   if dst now on word boundary
 * move_words:
 *     copy words; branch to finish_up
 *   if dst now on half word boundary
 *     load words, shift half words, store words; branch to finish_up
 *   if dst on byte 1
 *     load words, shift 3 bytes, store words; branch to finish_up
 *   if dst on byte 3
 *     load words, shift 1 byte, store words; branch to finish_up
 * finish_up:
 *     copy bytes; exit with dst addr
 * } else {                                         More than SMALL_MAX bytes
 *   move bytes until dst is on long word boundary
 *   if( src is on long word boundary ) {
 *     if (count < MED_MAX) {
 * finish_long:					   src/dst aligned on 8 bytes
 *       copy with ldx/stx in 8-way unrolled loop;
 *       copy final 0-63 bytes; exit with dst addr
 *     } else {				     src/dst aligned; count > MED_MAX
 *       align dst on 64 byte boundary; for main data movement:
 *       prefetch src data to L2 cache; let HW prefetch move data to L1 cache
 *       Use BIS (block initializing store) to avoid copying store cache
 *       lines from memory. But pre-store first element of each cache line
 *       ST_CHUNK lines in advance of the rest of that cache line. That
 *       gives time for replacement cache lines to be written back without
 *       excess STQ and Miss Buffer filling. Repeat until near the end,
 *       then finish up storing before going to finish_long.
 *     }
 *   } else {                                   src/dst not aligned on 8 bytes
 *     if src is word aligned and count < MED_WMAX
 *       move words in 8-way unrolled loop
 *       move final 0-31 bytes; exit with dst addr
 *     if count < MED_UMAX
 *       use alignaddr/faligndata combined with ldd/std in 8-way
 *       unrolled loop to move data.
 *       go to unalign_done
 *     else
 *       setup alignaddr for faligndata instructions
 *       align dst on 64 byte boundary; prefetch src data to L1 cache
 *       loadx8, falign, block-store, prefetch loop
 *	 (only use block-init-store when src/dst on 8 byte boundaries.)
 * unalign_done:
 *       move remaining bytes for unaligned cases. exit with dst addr.
 * }
 *
 */

#include <asm/visasm.h>
#include <asm/asi.h>

#if !defined(EX_LD) && !defined(EX_ST)
#define NON_USER_COPY
#endif
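
/*
 * Template note: this file is also included by the user-space copy
 * wrappers, which pre-define EX_LD/EX_ST (and the _FP variants) so that
 * every load/store gains an exception-table entry; the second argument
 * (e.g. memcpy_retl_o2_plus_o5) names the fixup that computes how many
 * bytes remained uncopied. Built standalone, the defaults below expand
 * to the bare instruction and NON_USER_COPY selects the kernel-only paths.
 */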

#ifndef EX_LD
#define EX_LD(x,y)	x
#endif
#ifndef EX_LD_FP
#define EX_LD_FP(x,y)	x
#endif

#ifndef EX_ST
#define EX_ST(x,y)	x
#endif
#ifndef EX_ST_FP
#define EX_ST_FP(x,y)	x
#endif

#ifndef EX_RETVAL
#define EX_RETVAL(x)    x
#endif

#ifndef LOAD
#define LOAD(type,addr,dest)	type [addr], dest
#endif

#ifndef STORE
#define STORE(type,src,addr)	type src, [addr]
#endif
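
/*
 * With the defaults above, EX_LD(LOAD(ldx, %o1, %o4), fixup) expands to
 * plain "ldx [%o1], %o4" and EX_ST(STORE(stx, %o4, %o0), fixup) to
 * "stx %o4, [%o0]"; the fixup argument is consumed only by the
 * exception-table variants of EX_LD/EX_ST.
 */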

/*
 * ASI_BLK_INIT_QUAD_LDD_P/ASI_BLK_INIT_QUAD_LDD_S marks the cache
 * line as "least recently used" which means if many threads are
 * active, it has a high probability of being pushed out of the cache
 * between the first initializing store and the final stores.
 * Thus, we use ASI_ST_BLKINIT_MRU_P/ASI_ST_BLKINIT_MRU_S which
 * marks the cache line as "most recently used" for all
 * but the last cache line.
 */
#ifndef STORE_ASI
#ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA
#define STORE_ASI	ASI_BLK_INIT_QUAD_LDD_P
#else
#define STORE_ASI	0x80		/* ASI_P */
#endif
#endif

#ifndef STORE_MRU_ASI
#ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA
#define STORE_MRU_ASI	ASI_ST_BLKINIT_MRU_P
#else
#define STORE_MRU_ASI	0x80		/* ASI_P */
#endif
#endif

#ifndef STORE_INIT
#define STORE_INIT(src,addr)	stxa src, [addr] STORE_ASI
#endif

#ifndef STORE_INIT_MRU
#define STORE_INIT_MRU(src,addr)	stxa src, [addr] STORE_MRU_ASI
#endif
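
/*
 * STORE_INIT(%o4, %o0) thus becomes "stxa %o4, [%o0] ASI_BLK_INIT_QUAD_LDD_P":
 * a block-init store that, when it hits the first doubleword of a
 * 64-byte-aligned line, allocates the cache line without first reading
 * its old contents from memory.
 */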

#ifndef FUNC_NAME
#define FUNC_NAME	M7memcpy
#endif

#ifndef PREAMBLE
#define PREAMBLE
#endif

#define	BLOCK_SIZE	64
#define	SHORTCOPY	3
#define	SHORTCHECK	14
#define	SHORT_LONG	64	/* max copy for short longword-aligned case */
				/* must be at least 64 */
#define	SMALL_MAX	128
#define	MED_UMAX	1024	/* max copy for medium un-aligned case */
#define	MED_WMAX	1024	/* max copy for medium word-aligned case */
#define	MED_MAX		1024	/* max copy for medium longword-aligned case */
#define ST_CHUNK	24	/* ST_CHUNK - block of values for BIS Store */
#define ALIGN_PRE	24	/* distance for aligned prefetch loop */

	.register	%g2,#scratch

	.section	".text"
	.global		FUNC_NAME
	.type		FUNC_NAME, #function
	.align		16
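/*
 * Entry: %o0 = dst, %o1 = src, %o2 = len; %o0 is preserved in %g1 as the
 * return value. The first three instructions trap (tne 5) if bit 31 of
 * the length is set, rejecting copies of 2 GB or more. Lengths then
 * dispatch as: 0 returns immediately, 1-3 -> .Ltiny_cp, 4-19 ->
 * .Lsmall_cp, 20-127 -> .Lmedium_cp, and >= SMALL_MAX (128) falls
 * through to .Lmedium.
 */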
FUNC_NAME:
	srlx            %o2, 31, %g2
	cmp             %g2, 0
	tne             %xcc, 5
	PREAMBLE
	mov		%o0, %g1	! save %o0
	brz,pn          %o2, .Lsmallx
	 cmp            %o2, 3
	ble,pn          %icc, .Ltiny_cp
	 cmp            %o2, 19
	ble,pn          %icc, .Lsmall_cp
	 or             %o0, %o1, %g2
	cmp             %o2, SMALL_MAX
	bl,pn           %icc, .Lmedium_cp
	 nop

.Lmedium:
	neg	%o0, %o5
	andcc	%o5, 7, %o5		! bytes till DST 8 byte aligned
	brz,pt	%o5, .Ldst_aligned_on_8

	! %o5 has the bytes to be written in partial store.
	 sub	%o2, %o5, %o2
	sub	%o1, %o0, %o1		! %o1 gets the difference
7:					! dst aligning loop
	add	%o1, %o0, %o4
	EX_LD(LOAD(ldub, %o4, %o4), memcpy_retl_o2_plus_o5)	! load one byte
	subcc	%o5, 1, %o5
	EX_ST(STORE(stb, %o4, %o0), memcpy_retl_o2_plus_o5_plus_1)
	bgu,pt	%xcc, 7b
	 add	%o0, 1, %o0		! advance dst
	add	%o1, %o0, %o1		! restore %o1
.Ldst_aligned_on_8:
	andcc	%o1, 7, %o5
	brnz,pt	%o5, .Lsrc_dst_unaligned_on_8
	 nop

.Lsrc_dst_aligned_on_8:
	! check if we are copying MED_MAX or more bytes
	set MED_MAX, %o3
	cmp %o2, %o3 			! limit to store buffer size
	bgu,pn	%xcc, .Llarge_align8_copy
	 nop

/*
 * Special case for handling when src and dest are both long word aligned
 * and total data to move is less than MED_MAX bytes
 */
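/*
 * The 64-byte loop below issues eight ldx/stx pairs per iteration,
 * interleaving loads and stores so each store's data is ready well
 * before it issues; the pointer increments are folded between memory
 * operations and the final store of each pass sits in the branch
 * delay slot.
 */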
.Lmedlong:
	subcc	%o2, 63, %o2		! adjust length to allow cc test
	ble,pn	%xcc, .Lmedl63		! skip big loop if less than 64 bytes
	 nop
.Lmedl64:
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_63)	! load
	subcc	%o2, 64, %o2		! decrement length count
	EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_63_64)	! and store
	EX_LD(LOAD(ldx, %o1+8, %o3), memcpy_retl_o2_plus_63_56)	! a block of 64
	EX_ST(STORE(stx, %o3, %o0+8), memcpy_retl_o2_plus_63_56)
	EX_LD(LOAD(ldx, %o1+16, %o4), memcpy_retl_o2_plus_63_48)
	EX_ST(STORE(stx, %o4, %o0+16), memcpy_retl_o2_plus_63_48)
	EX_LD(LOAD(ldx, %o1+24, %o3), memcpy_retl_o2_plus_63_40)
	EX_ST(STORE(stx, %o3, %o0+24), memcpy_retl_o2_plus_63_40)
	EX_LD(LOAD(ldx, %o1+32, %o4), memcpy_retl_o2_plus_63_32)! load and store
	EX_ST(STORE(stx, %o4, %o0+32), memcpy_retl_o2_plus_63_32)
	EX_LD(LOAD(ldx, %o1+40, %o3), memcpy_retl_o2_plus_63_24)! a block of 64
	add	%o1, 64, %o1		! increase src ptr by 64
	EX_ST(STORE(stx, %o3, %o0+40), memcpy_retl_o2_plus_63_24)
	EX_LD(LOAD(ldx, %o1-16, %o4), memcpy_retl_o2_plus_63_16)
	add	%o0, 64, %o0		! increase dst ptr by 64
	EX_ST(STORE(stx, %o4, %o0-16), memcpy_retl_o2_plus_63_16)
	EX_LD(LOAD(ldx, %o1-8, %o3), memcpy_retl_o2_plus_63_8)
	bgu,pt	%xcc, .Lmedl64		! repeat if at least 64 bytes left
	 EX_ST(STORE(stx, %o3, %o0-8), memcpy_retl_o2_plus_63_8)
.Lmedl63:
	addcc	%o2, 32, %o2		! adjust remaining count
	ble,pt	%xcc, .Lmedl31		! to skip if 31 or fewer bytes left
	 nop
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_31)	! load
	sub	%o2, 32, %o2		! decrement length count
	EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_31_32)	! and store
	EX_LD(LOAD(ldx, %o1+8, %o3), memcpy_retl_o2_plus_31_24)	! a block of 32
	add	%o1, 32, %o1		! increase src ptr by 32
	EX_ST(STORE(stx, %o3, %o0+8), memcpy_retl_o2_plus_31_24)
	EX_LD(LOAD(ldx, %o1-16, %o4), memcpy_retl_o2_plus_31_16)
	add	%o0, 32, %o0		! increase dst ptr by 32
	EX_ST(STORE(stx, %o4, %o0-16), memcpy_retl_o2_plus_31_16)
	EX_LD(LOAD(ldx, %o1-8, %o3), memcpy_retl_o2_plus_31_8)
	EX_ST(STORE(stx, %o3, %o0-8), memcpy_retl_o2_plus_31_8)
.Lmedl31:
	addcc	%o2, 16, %o2		! adjust remaining count
	ble,pt	%xcc, .Lmedl15		! skip if 15 or fewer bytes left
	 nop				!
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_15)
	add	%o1, 16, %o1		! increase src ptr by 16
	EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_15)
	sub	%o2, 16, %o2		! decrease count by 16
	EX_LD(LOAD(ldx, %o1-8, %o3), memcpy_retl_o2_plus_15_8)
	add	%o0, 16, %o0		! increase dst ptr by 16
	EX_ST(STORE(stx, %o3, %o0-8), memcpy_retl_o2_plus_15_8)
.Lmedl15:
	addcc	%o2, 15, %o2		! restore count
	bz,pt	%xcc, .Lsmallx	! exit if finished
	 cmp	%o2, 8
	blt,pt	%xcc, .Lmedw7		! skip if 7 or fewer bytes left
	 tst	%o2
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2)	! load 8 bytes
	add	%o1, 8, %o1		! increase src ptr by 8
	add	%o0, 8, %o0		! increase dst ptr by 8
	subcc	%o2, 8, %o2		! decrease count by 8
	bnz,pn	%xcc, .Lmedw7
	 EX_ST(STORE(stx, %o4, %o0-8), memcpy_retl_o2_plus_8)	! and store 8
	retl
	 mov	EX_RETVAL(%g1), %o0	! restore %o0

	.align 16
.Lsrc_dst_unaligned_on_8:
	! DST is 8-byte aligned, src is not
2:
	andcc	%o1, 0x3, %o5		! test word alignment
	bnz,pt	%xcc, .Lunalignsetup	! branch to skip if not word aligned
	 nop

/*
 * Handle all cases where src and dest are aligned on word
 * boundaries. Use unrolled loops for better performance.
 * This option wins over standard large data move when
 * source and destination are in cache for .Lmedium
 * to short data moves.
 */
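/*
 * Each stx below stores 8 bytes assembled from two 4-byte loads: the
 * lower-addressed word is shifted into the upper half (sllx %o4, 32)
 * and or-ed with the next word, so the 8-byte-aligned destination is
 * always written in full doublewords.
 */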
	set MED_WMAX, %o3
	cmp %o2, %o3 			! limit to store buffer size
	bge,pt	%xcc, .Lunalignrejoin	! otherwise rejoin main loop
	 nop

	subcc	%o2, 31, %o2		! adjust length to allow cc test
					! for end of loop
	ble,pt	%xcc, .Lmedw31		! skip big loop if 31 or fewer bytes left
.Lmedw32:
	EX_LD(LOAD(ld, %o1, %o4), memcpy_retl_o2_plus_31)! move a block of 32
	sllx	%o4, 32, %o5
	EX_LD(LOAD(ld, %o1+4, %o4), memcpy_retl_o2_plus_31)
	or	%o4, %o5, %o5
	EX_ST(STORE(stx, %o5, %o0), memcpy_retl_o2_plus_31)
	subcc	%o2, 32, %o2		! decrement length count
	EX_LD(LOAD(ld, %o1+8, %o4), memcpy_retl_o2_plus_31_24)
	sllx	%o4, 32, %o5
	EX_LD(LOAD(ld, %o1+12, %o4), memcpy_retl_o2_plus_31_24)
	or	%o4, %o5, %o5
	EX_ST(STORE(stx, %o5, %o0+8), memcpy_retl_o2_plus_31_24)
	add	%o1, 32, %o1		! increase src ptr by 32
	EX_LD(LOAD(ld, %o1-16, %o4), memcpy_retl_o2_plus_31_16)
	sllx	%o4, 32, %o5
	EX_LD(LOAD(ld, %o1-12, %o4), memcpy_retl_o2_plus_31_16)
	or	%o4, %o5, %o5
	EX_ST(STORE(stx, %o5, %o0+16), memcpy_retl_o2_plus_31_16)
	add	%o0, 32, %o0		! increase dst ptr by 32
	EX_LD(LOAD(ld, %o1-8, %o4), memcpy_retl_o2_plus_31_8)
	sllx	%o4, 32, %o5
	EX_LD(LOAD(ld, %o1-4, %o4), memcpy_retl_o2_plus_31_8)
	or	%o4, %o5, %o5
	bgu,pt	%xcc, .Lmedw32		! repeat if at least 32 bytes left
	 EX_ST(STORE(stx, %o5, %o0-8), memcpy_retl_o2_plus_31_8)
.Lmedw31:
	addcc	%o2, 31, %o2		! restore count

	bz,pt	%xcc, .Lsmallx	! exit if finished
	 nop
	cmp	%o2, 16
	blt,pt	%xcc, .Lmedw15
	 nop
	EX_LD(LOAD(ld, %o1, %o4), memcpy_retl_o2)! move a block of 16 bytes
	sllx	%o4, 32, %o5
	subcc	%o2, 16, %o2		! decrement length count
	EX_LD(LOAD(ld, %o1+4, %o4), memcpy_retl_o2_plus_16)
	or	%o4, %o5, %o5
	EX_ST(STORE(stx, %o5, %o0), memcpy_retl_o2_plus_16)
	add	%o1, 16, %o1		! increase src ptr by 16
	EX_LD(LOAD(ld, %o1-8, %o4), memcpy_retl_o2_plus_8)
	add	%o0, 16, %o0		! increase dst ptr by 16
	sllx	%o4, 32, %o5
	EX_LD(LOAD(ld, %o1-4, %o4), memcpy_retl_o2_plus_8)
	or	%o4, %o5, %o5
	EX_ST(STORE(stx, %o5, %o0-8), memcpy_retl_o2_plus_8)
.Lmedw15:
	bz,pt	%xcc, .Lsmallx	! exit if finished
	 cmp	%o2, 8
	blt,pn	%xcc, .Lmedw7		! skip if 7 or fewer bytes left
	 tst	%o2
	EX_LD(LOAD(ld, %o1, %o4), memcpy_retl_o2)	! load 4 bytes
	subcc	%o2, 8, %o2		! decrease count by 8
	EX_ST(STORE(stw, %o4, %o0), memcpy_retl_o2_plus_8)! and store 4 bytes
	add	%o1, 8, %o1		! increase src ptr by 8
	EX_LD(LOAD(ld, %o1-4, %o3), memcpy_retl_o2_plus_4)	! load 4 bytes
	add	%o0, 8, %o0		! increase dst ptr by 8
	EX_ST(STORE(stw, %o3, %o0-4), memcpy_retl_o2_plus_4)! and store 4 bytes
	bz,pt	%xcc, .Lsmallx	! exit if finished
.Lmedw7:				! count is ge 1, less than 8
	cmp	%o2, 4			! check for 4 bytes left
	blt,pn	%xcc, .Lsmallleft3	! skip if 3 or fewer bytes left
	 nop				!
	EX_LD(LOAD(ld, %o1, %o4), memcpy_retl_o2)	! load 4 bytes
	add	%o1, 4, %o1		! increase src ptr by 4
	add	%o0, 4, %o0		! increase dst ptr by 4
	subcc	%o2, 4, %o2		! decrease count by 4
	bnz	.Lsmallleft3
	 EX_ST(STORE(stw, %o4, %o0-4), memcpy_retl_o2_plus_4)! and store 4 bytes
	retl
	 mov	EX_RETVAL(%g1), %o0

	.align 16
.Llarge_align8_copy:			! Src and dst share 8 byte alignment
	! align dst to 64 byte boundary
	andcc	%o0, 0x3f, %o3		! %o3 == 0 means dst is 64 byte aligned
	brz,pn	%o3, .Laligned_to_64
	 andcc	%o0, 8, %o3		! odd long words to move?
	brz,pt	%o3, .Laligned_to_16
	 nop
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2)
	sub	%o2, 8, %o2
	add	%o1, 8, %o1		! increment src ptr
	add	%o0, 8, %o0		! increment dst ptr
	EX_ST(STORE(stx, %o4, %o0-8), memcpy_retl_o2_plus_8)
.Laligned_to_16:
	andcc	%o0, 16, %o3		! pair of long words to move?
	brz,pt	%o3, .Laligned_to_32
	 nop
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2)
	sub	%o2, 16, %o2
	EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_16)
	add	%o1, 16, %o1		! increment src ptr
	EX_LD(LOAD(ldx, %o1-8, %o4), memcpy_retl_o2_plus_8)
	add	%o0, 16, %o0		! increment dst ptr
	EX_ST(STORE(stx, %o4, %o0-8), memcpy_retl_o2_plus_8)
.Laligned_to_32:
	andcc	%o0, 32, %o3		! four long words to move?
	brz,pt	%o3, .Laligned_to_64
	 nop
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2)
	sub	%o2, 32, %o2
	EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_32)
	EX_LD(LOAD(ldx, %o1+8, %o4), memcpy_retl_o2_plus_24)
	EX_ST(STORE(stx, %o4, %o0+8), memcpy_retl_o2_plus_24)
	EX_LD(LOAD(ldx, %o1+16, %o4), memcpy_retl_o2_plus_16)
	EX_ST(STORE(stx, %o4, %o0+16), memcpy_retl_o2_plus_16)
	add	%o1, 32, %o1		! increment src ptr
	EX_LD(LOAD(ldx, %o1-8, %o4), memcpy_retl_o2_plus_8)
	add	%o0, 32, %o0		! increment dst ptr
	EX_ST(STORE(stx, %o4, %o0-8), memcpy_retl_o2_plus_8)
.Laligned_to_64:
!
!	Using block init store (BIS) instructions to avoid fetching cache
!	lines from memory. Use ST_CHUNK stores to first element of each cache
!	line (similar to prefetching) to avoid overfilling STQ or miss buffers.
!	Gives existing cache lines time to be moved out of L1/L2/L3 cache.
!	Initial stores using MRU version of BIS to keep cache line in
!	cache until we are ready to store final element of cache line.
!	Then store last element using the LRU version of BIS.
!
	andn	%o2, 0x3f, %o5		! %o5 is multiple of block size
	and	%o2, 0x3f, %o2		! residue bytes in %o2
!
!	We use STORE_MRU_ASI for the first seven stores to each cache line
!	followed by STORE_ASI (mark as LRU) for the last store. That
!	mixed approach reduces the probability that the cache line is removed
!	before we finish setting it, while minimizing the effects on
!	other cached values during a large memcpy.
!
!	ST_CHUNK batches up initial BIS operations for several cache lines
!	to allow multiple requests to not be blocked by overflowing the
!	store miss buffer. Then the matching stores for all those
!	BIS operations are executed.
!

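!
!	In C-like pseudocode, one ST_CHUNK pass of the loops below is:
!
!	    for (i = 0; i < ST_CHUNK; i++)	/* .Lalign_loop_start */
!	        dst[i*64] = src[i*64];		/* MRU BIS opens line i */
!	    for (i = 0; i < ST_CHUNK; i++) {	/* .Lalign_loop_rest */
!	        /* six MRU stores fill bytes 8-55 of line i */
!	        /* one LRU store writes bytes 56-63 and releases the line */
!	    }
!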
	sub	%o0, 8, %o0		! adjust %o0 for ASI alignment
.Lalign_loop:
	cmp	%o5, ST_CHUNK*64
	blu,pt	%xcc, .Lalign_loop_fin
	 mov	ST_CHUNK,%o3
.Lalign_loop_start:
	prefetch [%o1 + (ALIGN_PRE * BLOCK_SIZE)], 21
	subcc	%o3, 1, %o3
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_o5)
	add	%o1, 64, %o1
	add	%o0, 8, %o0
	EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
	bgu	%xcc,.Lalign_loop_start
	 add	%o0, 56, %o0

	mov	ST_CHUNK,%o3
	sllx	%o3, 6, %o4		! ST_CHUNK*64
	sub	%o1, %o4, %o1		! reset %o1
	sub	%o0, %o4, %o0		! reset %o0

.Lalign_loop_rest:
	EX_LD(LOAD(ldx, %o1+8, %o4), memcpy_retl_o2_plus_o5)
	add	%o0, 16, %o0
	EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1+16, %o4), memcpy_retl_o2_plus_o5)
	add	%o0, 8, %o0
	EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
	subcc	%o3, 1, %o3
	EX_LD(LOAD(ldx, %o1+24, %o4), memcpy_retl_o2_plus_o5)
	add	%o0, 8, %o0
	EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1+32, %o4), memcpy_retl_o2_plus_o5)
	add	%o0, 8, %o0
	EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1+40, %o4), memcpy_retl_o2_plus_o5)
	add	%o0, 8, %o0
	EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1+48, %o4), memcpy_retl_o2_plus_o5)
	add	%o1, 64, %o1
	add	%o0, 8, %o0
	EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
	add	%o0, 8, %o0
	EX_LD(LOAD(ldx, %o1-8, %o4), memcpy_retl_o2_plus_o5)
	sub	%o5, 64, %o5
	bgu	%xcc,.Lalign_loop_rest
	! mark cache line as LRU
	 EX_ST(STORE_INIT(%o4, %o0), memcpy_retl_o2_plus_o5_plus_64)

	cmp	%o5, ST_CHUNK*64
	bgu,pt	%xcc, .Lalign_loop_start
	 mov	ST_CHUNK,%o3

	cmp	%o5, 0
	beq	.Lalign_done
	 nop
.Lalign_loop_fin:
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_o5)
	EX_ST(STORE(stx, %o4, %o0+8), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1+8, %o4), memcpy_retl_o2_plus_o5)
	EX_ST(STORE(stx, %o4, %o0+8+8), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1+16, %o4), memcpy_retl_o2_plus_o5)
	EX_ST(STORE(stx, %o4, %o0+8+16), memcpy_retl_o2_plus_o5)
	subcc	%o5, 64, %o5
	EX_LD(LOAD(ldx, %o1+24, %o4), memcpy_retl_o2_plus_o5_64)
	EX_ST(STORE(stx, %o4, %o0+8+24), memcpy_retl_o2_plus_o5_64)
	EX_LD(LOAD(ldx, %o1+32, %o4), memcpy_retl_o2_plus_o5_64)
	EX_ST(STORE(stx, %o4, %o0+8+32), memcpy_retl_o2_plus_o5_64)
	EX_LD(LOAD(ldx, %o1+40, %o4), memcpy_retl_o2_plus_o5_64)
	EX_ST(STORE(stx, %o4, %o0+8+40), memcpy_retl_o2_plus_o5_64)
	EX_LD(LOAD(ldx, %o1+48, %o4), memcpy_retl_o2_plus_o5_64)
	add	%o1, 64, %o1
	EX_ST(STORE(stx, %o4, %o0+8+48), memcpy_retl_o2_plus_o5_64)
	add	%o0, 64, %o0
	EX_LD(LOAD(ldx, %o1-8, %o4), memcpy_retl_o2_plus_o5_64)
	bgu	%xcc,.Lalign_loop_fin
	 EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_o5_64)

.Lalign_done:
	add	%o0, 8, %o0		! restore %o0 from ASI alignment
	membar	#StoreStore
	sub	%o2, 63, %o2		! adjust length to allow cc test
	ba	.Lmedl63		! in .Lmedl63
	 nop

	.align 16
	! Dst is on 8 byte boundary; src is not; remaining count > SMALL_MAX
.Lunalignsetup:
.Lunalignrejoin:
	mov	%g1, %o3	! save %g1 as VISEntryHalf clobbers it
#ifdef NON_USER_COPY
	VISEntryHalfFast(.Lmedium_vis_entry_fail_cp)
#else
	VISEntryHalf
#endif
	mov	%o3, %g1	! restore %g1

	set MED_UMAX, %o3
	cmp %o2, %o3 		! check for .Lmedium unaligned limit
	bge,pt	%xcc,.Lunalign_large
	 prefetch [%o1 + (4 * BLOCK_SIZE)], 20
	andn	%o2, 0x3f, %o5		! %o5 is multiple of block size
	and	%o2, 0x3f, %o2		! residue bytes in %o2
	cmp	%o2, 8			! Ensure we do not load beyond
	bgt	.Lunalign_adjust	! end of source buffer
	 andn	%o1, 0x7, %o4		! %o4 has long word aligned src address
	add	%o2, 64, %o2		! adjust to leave loop
	sub	%o5, 64, %o5		! early if necessary
.Lunalign_adjust:
	alignaddr %o1, %g0, %g0		! generate %gsr
	add	%o1, %o5, %o1		! advance %o1 to after blocks
	EX_LD_FP(LOAD(ldd, %o4, %f0), memcpy_retl_o2_plus_o5)
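!	alignaddr above latched the low three bits of %o1 in %gsr; each
!	faligndata below concatenates two consecutive aligned doublewords
!	and extracts the eight source bytes starting at that offset, so the
!	loop performs only aligned ldd/std accesses while storing the
!	correctly shifted byte stream.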
.Lunalign_loop:
	EX_LD_FP(LOAD(ldd, %o4+8, %f2), memcpy_retl_o2_plus_o5)
	faligndata %f0, %f2, %f16
	EX_LD_FP(LOAD(ldd, %o4+16, %f4), memcpy_retl_o2_plus_o5)
	subcc	%o5, BLOCK_SIZE, %o5
	EX_ST_FP(STORE(std, %f16, %o0), memcpy_retl_o2_plus_o5_plus_64)
	faligndata %f2, %f4, %f18
	EX_LD_FP(LOAD(ldd, %o4+24, %f6), memcpy_retl_o2_plus_o5_plus_56)
	EX_ST_FP(STORE(std, %f18, %o0+8), memcpy_retl_o2_plus_o5_plus_56)
	faligndata %f4, %f6, %f20
	EX_LD_FP(LOAD(ldd, %o4+32, %f8), memcpy_retl_o2_plus_o5_plus_48)
	EX_ST_FP(STORE(std, %f20, %o0+16), memcpy_retl_o2_plus_o5_plus_48)
	faligndata %f6, %f8, %f22
	EX_LD_FP(LOAD(ldd, %o4+40, %f10), memcpy_retl_o2_plus_o5_plus_40)
	EX_ST_FP(STORE(std, %f22, %o0+24), memcpy_retl_o2_plus_o5_plus_40)
	faligndata %f8, %f10, %f24
	EX_LD_FP(LOAD(ldd, %o4+48, %f12), memcpy_retl_o2_plus_o5_plus_32)
	EX_ST_FP(STORE(std, %f24, %o0+32), memcpy_retl_o2_plus_o5_plus_32)
	faligndata %f10, %f12, %f26
	EX_LD_FP(LOAD(ldd, %o4+56, %f14), memcpy_retl_o2_plus_o5_plus_24)
	add	%o4, BLOCK_SIZE, %o4
	EX_ST_FP(STORE(std, %f26, %o0+40), memcpy_retl_o2_plus_o5_plus_24)
	faligndata %f12, %f14, %f28
	EX_LD_FP(LOAD(ldd, %o4, %f0), memcpy_retl_o2_plus_o5_plus_16)
	EX_ST_FP(STORE(std, %f28, %o0+48), memcpy_retl_o2_plus_o5_plus_16)
	faligndata %f14, %f0, %f30
	EX_ST_FP(STORE(std, %f30, %o0+56), memcpy_retl_o2_plus_o5_plus_8)
	add	%o0, BLOCK_SIZE, %o0
	bgu,pt	%xcc, .Lunalign_loop
	 prefetch [%o4 + (5 * BLOCK_SIZE)], 20
	ba	.Lunalign_done
	 nop

.Lunalign_large:
	andcc	%o0, 0x3f, %o3		! is dst 64-byte block aligned?
	bz	%xcc, .Lunalignsrc
	 sub	%o3, 64, %o3		! %o3 will be multiple of 8
	neg	%o3			! bytes until dest is 64 byte aligned
	sub	%o2, %o3, %o2		! update cnt with bytes to be moved
	! Move bytes according to source alignment
	andcc	%o1, 0x1, %o5
	bnz	%xcc, .Lunalignbyte	! check for byte alignment
	 nop
	andcc	%o1, 2, %o5		! check for half word alignment
	bnz	%xcc, .Lunalignhalf
	 nop
	! Src is word aligned
.Lunalignword:
	EX_LD_FP(LOAD(ld, %o1, %o4), memcpy_retl_o2_plus_o3)	! load 4 bytes
	add	%o1, 8, %o1		! increase src ptr by 8
	EX_ST_FP(STORE(stw, %o4, %o0), memcpy_retl_o2_plus_o3)	! and store 4
	subcc	%o3, 8, %o3		! decrease count by 8
	EX_LD_FP(LOAD(ld, %o1-4, %o4), memcpy_retl_o2_plus_o3_plus_4)! load 4
	add	%o0, 8, %o0		! increase dst ptr by 8
	bnz	%xcc, .Lunalignword
	 EX_ST_FP(STORE(stw, %o4, %o0-4), memcpy_retl_o2_plus_o3_plus_4)
	ba	.Lunalignsrc
	 nop

	! Src is half-word aligned
.Lunalignhalf:
	EX_LD_FP(LOAD(lduh, %o1, %o4), memcpy_retl_o2_plus_o3)	! load 2 bytes
	sllx	%o4, 32, %o5		! shift left
	EX_LD_FP(LOAD(lduw, %o1+2, %o4), memcpy_retl_o2_plus_o3)
	or	%o4, %o5, %o5
	sllx	%o5, 16, %o5
	EX_LD_FP(LOAD(lduh, %o1+6, %o4), memcpy_retl_o2_plus_o3)
	or	%o4, %o5, %o5
	EX_ST_FP(STORE(stx, %o5, %o0), memcpy_retl_o2_plus_o3)
	add	%o1, 8, %o1
	subcc	%o3, 8, %o3
	bnz	%xcc, .Lunalignhalf
	 add	%o0, 8, %o0
	ba	.Lunalignsrc
	 nop

	! Src is Byte aligned
.Lunalignbyte:
	sub	%o0, %o1, %o0		! share pointer advance
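	! Each pass below assembles one destination doubleword from a byte,
	! three halfwords, and a byte (shifted into bits 63-56, 55-40,
	! 39-24, 23-8, and 7-0). Since %o0 temporarily holds dst - src, the
	! single "add %o1, 8, %o1" in the delay slot advances both pointers.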
.Lunalignbyte_loop:
	EX_LD_FP(LOAD(ldub, %o1, %o4), memcpy_retl_o2_plus_o3)
	sllx	%o4, 56, %o5
	EX_LD_FP(LOAD(lduh, %o1+1, %o4), memcpy_retl_o2_plus_o3)
	sllx	%o4, 40, %o4
	or	%o4, %o5, %o5
	EX_LD_FP(LOAD(lduh, %o1+3, %o4), memcpy_retl_o2_plus_o3)
	sllx	%o4, 24, %o4
	or	%o4, %o5, %o5
	EX_LD_FP(LOAD(lduh, %o1+5, %o4), memcpy_retl_o2_plus_o3)
	sllx	%o4,  8, %o4
	or	%o4, %o5, %o5
	EX_LD_FP(LOAD(ldub, %o1+7, %o4), memcpy_retl_o2_plus_o3)
	or	%o4, %o5, %o5
	add	%o0, %o1, %o0
	EX_ST_FP(STORE(stx, %o5, %o0), memcpy_retl_o2_plus_o3)
	sub	%o0, %o1, %o0
	subcc	%o3, 8, %o3
	bnz	%xcc, .Lunalignbyte_loop
	 add	%o1, 8, %o1
	add	%o0, %o1, %o0 		! restore pointer

	! Destination is now block (64 byte) aligned
.Lunalignsrc:
	andn	%o2, 0x3f, %o5		! %o5 is multiple of block size
	and	%o2, 0x3f, %o2		! residue bytes in %o2
	add	%o2, 64, %o2		! Ensure we do not load beyond
	sub	%o5, 64, %o5		! end of source buffer

	andn	%o1, 0x7, %o4		! %o4 has long word aligned src address
	alignaddr %o1, %g0, %g0		! generate %gsr
	add	%o1, %o5, %o1		! advance %o1 to after blocks

	EX_LD_FP(LOAD(ldd, %o4, %f14), memcpy_retl_o2_plus_o5)
	add	%o4, 8, %o4
.Lunalign_sloop:
	EX_LD_FP(LOAD(ldd, %o4, %f16), memcpy_retl_o2_plus_o5)
	faligndata %f14, %f16, %f0
	EX_LD_FP(LOAD(ldd, %o4+8, %f18), memcpy_retl_o2_plus_o5)
	faligndata %f16, %f18, %f2
	EX_LD_FP(LOAD(ldd, %o4+16, %f20), memcpy_retl_o2_plus_o5)
	faligndata %f18, %f20, %f4
	EX_ST_FP(STORE(std, %f0, %o0), memcpy_retl_o2_plus_o5)
	subcc	%o5, 64, %o5
	EX_LD_FP(LOAD(ldd, %o4+24, %f22), memcpy_retl_o2_plus_o5_plus_56)
	faligndata %f20, %f22, %f6
	EX_ST_FP(STORE(std, %f2, %o0+8), memcpy_retl_o2_plus_o5_plus_56)
	EX_LD_FP(LOAD(ldd, %o4+32, %f24), memcpy_retl_o2_plus_o5_plus_48)
	faligndata %f22, %f24, %f8
	EX_ST_FP(STORE(std, %f4, %o0+16), memcpy_retl_o2_plus_o5_plus_48)
	EX_LD_FP(LOAD(ldd, %o4+40, %f26), memcpy_retl_o2_plus_o5_plus_40)
	faligndata %f24, %f26, %f10
	EX_ST_FP(STORE(std, %f6, %o0+24), memcpy_retl_o2_plus_o5_plus_40)
	EX_LD_FP(LOAD(ldd, %o4+48, %f28), memcpy_retl_o2_plus_o5_plus_40)
	faligndata %f26, %f28, %f12
	EX_ST_FP(STORE(std, %f8, %o0+32), memcpy_retl_o2_plus_o5_plus_40)
	add	%o4, 64, %o4
	EX_LD_FP(LOAD(ldd, %o4-8, %f30), memcpy_retl_o2_plus_o5_plus_40)
	faligndata %f28, %f30, %f14
	EX_ST_FP(STORE(std, %f10, %o0+40), memcpy_retl_o2_plus_o5_plus_40)
	EX_ST_FP(STORE(std, %f12, %o0+48), memcpy_retl_o2_plus_o5_plus_40)
	add	%o0, 64, %o0
	EX_ST_FP(STORE(std, %f14, %o0-8), memcpy_retl_o2_plus_o5_plus_40)
	fsrc2	%f30, %f14
	bgu,pt	%xcc, .Lunalign_sloop
	 prefetch [%o4 + (8 * BLOCK_SIZE)], 20

.Lunalign_done:
	! Handle trailing bytes, 64 to 127
	! Dest long word aligned, Src not long word aligned
	cmp	%o2, 15
	bleu	%xcc, .Lunalign_short

	 andn	%o2, 0x7, %o5		! %o5 is multiple of 8
	and	%o2, 0x7, %o2		! residue bytes in %o2
	add	%o2, 8, %o2
	sub	%o5, 8, %o5		! ensure we do not load past end of src
	andn	%o1, 0x7, %o4		! %o4 has long word aligned src address
	add	%o1, %o5, %o1		! advance %o1 to after multiple of 8
	EX_LD_FP(LOAD(ldd, %o4, %f0), memcpy_retl_o2_plus_o5)! fetch partialword
.Lunalign_by8:
	EX_LD_FP(LOAD(ldd, %o4+8, %f2), memcpy_retl_o2_plus_o5)
	add	%o4, 8, %o4
	faligndata %f0, %f2, %f16
	subcc	%o5, 8, %o5
	EX_ST_FP(STORE(std, %f16, %o0), memcpy_retl_o2_plus_o5)
	fsrc2	%f2, %f0
	bgu,pt	%xcc, .Lunalign_by8
	 add	%o0, 8, %o0

.Lunalign_short:
#ifdef NON_USER_COPY
	VISExitHalfFast
#else
	VISExitHalf
#endif
	ba	.Lsmallrest
	 nop

/*
 * This is a special case of nested memcpy. This can happen when the kernel
 * calls unaligned memcpy back to back without saving FP registers. We need
 * traps (a context switch) to save/restore FP registers. If the kernel calls
 * memcpy without this trap sequence we will hit FP corruption. Let's use
 * the normal integer load/store method in this case.
 */

#ifdef NON_USER_COPY
.Lmedium_vis_entry_fail_cp:
	or	%o0, %o1, %g2
#endif
.Lmedium_cp:
	LOAD(prefetch, %o1 + 0x40, #n_reads_strong)
	andcc	%g2, 0x7, %g0
	bne,pn	%xcc, .Lmedium_unaligned_cp
	 nop

.Lmedium_noprefetch_cp:
	andncc	%o2, 0x20 - 1, %o5
	be,pn	%xcc, 2f
	 sub	%o2, %o5, %o2
1:	EX_LD(LOAD(ldx, %o1 + 0x00, %o3), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1 + 0x08, %g2), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1 + 0x10, %g7), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1 + 0x18, %o4), memcpy_retl_o2_plus_o5)
	add	%o1, 0x20, %o1
	subcc	%o5, 0x20, %o5
	EX_ST(STORE(stx, %o3, %o0 + 0x00), memcpy_retl_o2_plus_o5_plus_32)
	EX_ST(STORE(stx, %g2, %o0 + 0x08), memcpy_retl_o2_plus_o5_plus_24)
	EX_ST(STORE(stx, %g7, %o0 + 0x10), memcpy_retl_o2_plus_o5_plus_24)
	EX_ST(STORE(stx, %o4, %o0 + 0x18), memcpy_retl_o2_plus_o5_plus_8)
	bne,pt	%xcc, 1b
	 add	%o0, 0x20, %o0
2:	andcc	%o2, 0x18, %o5
	be,pt	%xcc, 3f
	 sub	%o2, %o5, %o2
1:	EX_LD(LOAD(ldx, %o1 + 0x00, %o3), memcpy_retl_o2_plus_o5)
	add	%o1, 0x08, %o1
	add	%o0, 0x08, %o0
	subcc	%o5, 0x08, %o5
	bne,pt	%xcc, 1b
	 EX_ST(STORE(stx, %o3, %o0 - 0x08), memcpy_retl_o2_plus_o5_plus_8)
3:	brz,pt	%o2, .Lexit_cp
	 cmp	%o2, 0x04
	bl,pn	%xcc, .Ltiny_cp
	 nop
	EX_LD(LOAD(lduw, %o1 + 0x00, %o3), memcpy_retl_o2)
	add	%o1, 0x04, %o1
	add	%o0, 0x04, %o0
	subcc	%o2, 0x04, %o2
	bne,pn	%xcc, .Ltiny_cp
	 EX_ST(STORE(stw, %o3, %o0 - 0x04), memcpy_retl_o2_plus_4)
	ba,a,pt	%xcc, .Lexit_cp

.Lmedium_unaligned_cp:
	/* First get dest 8 byte aligned.  */
	sub	%g0, %o0, %o3
	and	%o3, 0x7, %o3
	brz,pt	%o3, 2f
	 sub	%o2, %o3, %o2

1:	EX_LD(LOAD(ldub, %o1 + 0x00, %g2), memcpy_retl_o2_plus_g1)
	add	%o1, 1, %o1
	subcc	%o3, 1, %o3
	add	%o0, 1, %o0
	bne,pt	%xcc, 1b
	 EX_ST(STORE(stb, %g2, %o0 - 0x01), memcpy_retl_o2_plus_g1_plus_1)
2:
	and	%o1, 0x7, %o3
	brz,pn	%o3, .Lmedium_noprefetch_cp
	 sll	%o3, 3, %o3
	mov	64, %g2
	sub	%g2, %o3, %g2
	andn	%o1, 0x7, %o1
	EX_LD(LOAD(ldx, %o1 + 0x00, %o4), memcpy_retl_o2)
	sllx	%o4, %o3, %o4
	andn	%o2, 0x08 - 1, %o5
	sub	%o2, %o5, %o2

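	/*
	 * Shift-merge loop: %o4 carries the previous aligned doubleword,
	 * already shifted left by the misalignment in bits (%o3); each new
	 * doubleword %g3 is shifted right by 64 - %o3 bits (%g2) and or-ed
	 * in, yielding one aligned 8-byte store per iteration.
	 */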
1:	EX_LD(LOAD(ldx, %o1 + 0x08, %g3), memcpy_retl_o2_plus_o5)
	add	%o1, 0x08, %o1
	subcc	%o5, 0x08, %o5
	srlx	%g3, %g2, %g7
	or	%g7, %o4, %g7
	EX_ST(STORE(stx, %g7, %o0 + 0x00), memcpy_retl_o2_plus_o5_plus_8)
	add	%o0, 0x08, %o0
	bne,pt	%xcc, 1b
	 sllx	%g3, %o3, %o4
	srl	%o3, 3, %o3
	add	%o1, %o3, %o1
	brz,pn	%o2, .Lexit_cp
	 nop
	ba,pt	%xcc, .Lsmall_unaligned_cp

.Ltiny_cp:
	EX_LD(LOAD(ldub, %o1 + 0x00, %o3), memcpy_retl_o2)
	subcc	%o2, 1, %o2
	be,pn	%xcc, .Lexit_cp
	 EX_ST(STORE(stb, %o3, %o0 + 0x00), memcpy_retl_o2_plus_1)
	EX_LD(LOAD(ldub, %o1 + 0x01, %o3), memcpy_retl_o2)
	subcc	%o2, 1, %o2
	be,pn	%xcc, .Lexit_cp
	 EX_ST(STORE(stb, %o3, %o0 + 0x01), memcpy_retl_o2_plus_1)
	EX_LD(LOAD(ldub, %o1 + 0x02, %o3), memcpy_retl_o2)
	ba,pt	%xcc, .Lexit_cp
	 EX_ST(STORE(stb, %o3, %o0 + 0x02), memcpy_retl_o2)

.Lsmall_cp:
	andcc	%g2, 0x3, %g0
	bne,pn	%xcc, .Lsmall_unaligned_cp
	 andn	%o2, 0x4 - 1, %o5
	sub	%o2, %o5, %o2
1:
	EX_LD(LOAD(lduw, %o1 + 0x00, %o3), memcpy_retl_o2_plus_o5)
	add	%o1, 0x04, %o1
	subcc	%o5, 0x04, %o5
	add	%o0, 0x04, %o0
	bne,pt	%xcc, 1b
	 EX_ST(STORE(stw, %o3, %o0 - 0x04), memcpy_retl_o2_plus_o5_plus_4)
	brz,pt	%o2, .Lexit_cp
	 nop
	ba,a,pt	%xcc, .Ltiny_cp

.Lsmall_unaligned_cp:
1:	EX_LD(LOAD(ldub, %o1 + 0x00, %o3), memcpy_retl_o2)
	add	%o1, 1, %o1
	add	%o0, 1, %o0
	subcc	%o2, 1, %o2
	bne,pt	%xcc, 1b
	 EX_ST(STORE(stb, %o3, %o0 - 0x01), memcpy_retl_o2_plus_1)
	ba,a,pt	%xcc, .Lexit_cp

.Lsmallrest:
	tst	%o2
	bz,pt	%xcc, .Lsmallx
	 cmp	%o2, 4
	blt,pn	%xcc, .Lsmallleft3
	 nop
	sub	%o2, 3, %o2
.Lsmallnotalign4:
	EX_LD(LOAD(ldub, %o1, %o3), memcpy_retl_o2_plus_3)! read byte
	subcc	%o2, 4, %o2		! reduce count by 4
	EX_ST(STORE(stb, %o3, %o0), memcpy_retl_o2_plus_7)! write byte & repeat
	EX_LD(LOAD(ldub, %o1+1, %o3), memcpy_retl_o2_plus_6)! for total of 4
	add	%o1, 4, %o1		! advance SRC by 4
	EX_ST(STORE(stb, %o3, %o0+1), memcpy_retl_o2_plus_6)
	EX_LD(LOAD(ldub, %o1-2, %o3), memcpy_retl_o2_plus_5)
	add	%o0, 4, %o0		! advance DST by 4
	EX_ST(STORE(stb, %o3, %o0-2), memcpy_retl_o2_plus_5)
	EX_LD(LOAD(ldub, %o1-1, %o3), memcpy_retl_o2_plus_4)
	bgu,pt	%xcc, .Lsmallnotalign4	! loop until 3 or fewer bytes remain
	EX_ST(STORE(stb, %o3, %o0-1), memcpy_retl_o2_plus_4)
	addcc	%o2, 3, %o2		! restore count
	bz,pt	%xcc, .Lsmallx
.Lsmallleft3:				! 1, 2, or 3 bytes remain
	subcc	%o2, 1, %o2
	EX_LD(LOAD(ldub, %o1, %o3), memcpy_retl_o2_plus_1)	! load one byte
	bz,pt	%xcc, .Lsmallx
	EX_ST(STORE(stb, %o3, %o0), memcpy_retl_o2_plus_1)	! store one byte
	EX_LD(LOAD(ldub, %o1+1, %o3), memcpy_retl_o2)	! load second byte
	subcc	%o2, 1, %o2
	bz,pt	%xcc, .Lsmallx
	EX_ST(STORE(stb, %o3, %o0+1), memcpy_retl_o2_plus_1)! store second byte
	EX_LD(LOAD(ldub, %o1+2, %o3), memcpy_retl_o2)	! load third byte
	EX_ST(STORE(stb, %o3, %o0+2), memcpy_retl_o2)	! store third byte
.Lsmallx:
	retl
	 mov	EX_RETVAL(%g1), %o0
.Lsmallfin:
	tst	%o2
	bnz,pn	%xcc, .Lsmallleft3
	 nop
	retl
	 mov	EX_RETVAL(%g1), %o0	! restore %o0
.Lexit_cp:
	retl
	 mov	EX_RETVAL(%g1), %o0
	.size  FUNC_NAME, .-FUNC_NAME