cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

fastcopy.S (20680B)


      1/*
      2 * Copyright (C) 2008-2009 Michal Simek <monstr@monstr.eu>
      3 * Copyright (C) 2008-2009 PetaLogix
      4 * Copyright (C) 2008 Jim Law - Iris LP  All rights reserved.
      5 *
      6 * This file is subject to the terms and conditions of the GNU General
      7 * Public License.  See the file COPYING in the main directory of this
      8 * archive for more details.
      9 *
     10 * Written by Jim Law <jlaw@irispower.com>
     11 *
     12 * intended to replace:
     13 *	memcpy in memcpy.c and
     14 *	memmove in memmove.c
     15 * ... in arch/microblaze/lib
     16 *
     17 *
     18 * assly_fastcopy.S
     19 *
     20 * Attempt at quicker memcpy and memmove for MicroBlaze
     21 *	Input :	Operand1 in Reg r5 - destination address
     22 *		Operand2 in Reg r6 - source address
     23 *		Operand3 in Reg r7 - number of bytes to transfer
     24 *	Output: Result in Reg r3 - starting destination address
     25 *
     26 *
     27 * Explanation:
     28 *	Perform (possibly unaligned) copy of a block of memory
     29 *	between mem locations with size of xfer spec'd in bytes
     30 */
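/*
 * Overview: the code below first copies single bytes until the destination
 * is word aligned, then moves 32-byte blocks, then whole words, and finally
 * the remaining tail bytes.  A minimal C sketch of that strategy (names are
 * illustrative only and not part of the kernel sources; the 32-byte block
 * pass is just an unrolled form of the word loop):
 *
 *	void *sketch_memcpy(void *dst, const void *src, unsigned long c)
 *	{
 *		char *d = dst;
 *		const char *s = src;
 *
 *		while (c && ((unsigned long)d & 3)) {	// align destination
 *			*d++ = *s++;
 *			c--;
 *		}
 *		if (!((unsigned long)s & 3))		// source aligned too
 *			for (; c >= 4; c -= 4, d += 4, s += 4)
 *				*(unsigned int *)d = *(const unsigned int *)s;
 *		while (c--)				// tail bytes
 *			*d++ = *s++;
 *		return dst;
 *	}
 *
 * A misaligned source is handled below by loading aligned words and merging
 * neighbouring words with shifts (the *_u1/u2/u3 paths).
 */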
     31
     32#include <linux/linkage.h>
     33	.text
     34	.globl	memcpy
     35	.type  memcpy, @function
     36	.ent	memcpy
     37
     38memcpy:
     39fast_memcpy_ascending:
     40	/* move d to return register as value of function */
     41	addi	r3, r5, 0
     42
     43	addi	r4, r0, 4	/* n = 4 */
     44	cmpu	r4, r4, r7	/* n = c - n  (unsigned) */
     45	blti	r4, a_xfer_end	/* if n < 0, less than one word to transfer */
     46
     47	/* transfer first 0~3 bytes to get aligned dest address */
     48	andi	r4, r5, 3		/* n = d & 3 */
     49	/* if zero, destination already aligned */
     50	beqi	r4, a_dalign_done
     51	/* n = 4 - n (yields 3, 2, 1 transfers for 1, 2, 3 addr offset) */
     52	rsubi	r4, r4, 4
     53	rsub	r7, r4, r7		/* c = c - n adjust c */
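	/*
	 * Only the destination is aligned here: the word and block stores
	 * below assume a word-aligned d, while a misaligned source is
	 * handled separately by the *_u1/u2/u3 shift-and-merge paths.
	 */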
     54
     55a_xfer_first_loop:
     56	/* if no bytes left to transfer, transfer the bulk */
     57	beqi	r4, a_dalign_done
     58	lbui	r11, r6, 0		/* h = *s */
     59	sbi	r11, r5, 0		/* *d = h */
     60	addi	r6, r6, 1		/* s++ */
     61	addi	r5, r5, 1		/* d++ */
     62	brid	a_xfer_first_loop	/* loop */
     63	addi	r4, r4, -1		/* n-- (IN DELAY SLOT) */
     64
     65a_dalign_done:
     66	addi	r4, r0, 32		/* n = 32 */
     67	cmpu	r4, r4, r7		/* n = c - n  (unsigned) */
     68	/* if n < 0, less than one block to transfer */
     69	blti	r4, a_block_done
     70
     71a_block_xfer:
     72	andi	r4, r7, 0xffffffe0	/* n = c & ~31 */
     73	rsub	r7, r4, r7		/* c = c - n */
     74
     75	andi	r9, r6, 3		/* t1 = s & 3 */
     76	/* if temp != 0, unaligned transfers needed */
     77	bnei	r9, a_block_unaligned
     78
     79a_block_aligned:
     80	lwi	r9, r6, 0		/* t1 = *(s + 0) */
     81	lwi	r10, r6, 4		/* t2 = *(s + 4) */
     82	lwi	r11, r6, 8		/* t3 = *(s + 8) */
     83	lwi	r12, r6, 12		/* t4 = *(s + 12) */
     84	swi	r9, r5, 0		/* *(d + 0) = t1 */
     85	swi	r10, r5, 4		/* *(d + 4) = t2 */
     86	swi	r11, r5, 8		/* *(d + 8) = t3 */
     87	swi	r12, r5, 12		/* *(d + 12) = t4 */
     88	lwi	r9, r6, 16		/* t1 = *(s + 16) */
     89	lwi	r10, r6, 20		/* t2 = *(s + 20) */
     90	lwi	r11, r6, 24		/* t3 = *(s + 24) */
     91	lwi	r12, r6, 28		/* t4 = *(s + 28) */
     92	swi	r9, r5, 16		/* *(d + 16) = t1 */
     93	swi	r10, r5, 20		/* *(d + 20) = t2 */
     94	swi	r11, r5, 24		/* *(d + 24) = t3 */
     95	swi	r12, r5, 28		/* *(d + 28) = t4 */
     96	addi	r6, r6, 32		/* s = s + 32 */
     97	addi	r4, r4, -32		/* n = n - 32 */
     98	bneid	r4, a_block_aligned	/* while (n) loop */
     99	addi	r5, r5, 32		/* d = d + 32 (IN DELAY SLOT) */
    100	bri	a_block_done
    101
    102a_block_unaligned:
    103	andi	r8, r6, 0xfffffffc	/* as = s & ~3 */
    104	add	r6, r6, r4		/* s = s + n */
    105	lwi	r11, r8, 0		/* h = *(as + 0) */
    106
    107	addi	r9, r9, -1
    108	beqi	r9, a_block_u1		/* t1 was 1 => 1 byte offset */
    109	addi	r9, r9, -1
    110	beqi	r9, a_block_u2		/* t1 was 2 => 2 byte offset */
    111
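	/*
	 * 3-byte source offset: r11 ("h") carries the bytes of the previous
	 * aligned word pre-shifted into the high end, and each step below
	 * merges them with the top of the next aligned word, i.e. roughly
	 * out = (prev << 24) | (next >> 8) in C, mirroring the bslli/bsrli
	 * pair (byte significance as on the big-endian MicroBlaze this
	 * routine was written for).
	 */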
    112a_block_u3:
    113	bslli	r11, r11, 24	/* h = h << 24 */
    114a_bu3_loop:
    115	lwi	r12, r8, 4	/* v = *(as + 4) */
    116	bsrli	r9, r12, 8	/* t1 = v >> 8 */
    117	or	r9, r11, r9	/* t1 = h | t1 */
    118	swi	r9, r5, 0	/* *(d + 0) = t1 */
    119	bslli	r11, r12, 24	/* h = v << 24 */
    120	lwi	r12, r8, 8	/* v = *(as + 8) */
    121	bsrli	r9, r12, 8	/* t1 = v >> 8 */
    122	or	r9, r11, r9	/* t1 = h | t1 */
    123	swi	r9, r5, 4	/* *(d + 4) = t1 */
    124	bslli	r11, r12, 24	/* h = v << 24 */
    125	lwi	r12, r8, 12	/* v = *(as + 12) */
    126	bsrli	r9, r12, 8	/* t1 = v >> 8 */
    127	or	r9, r11, r9	/* t1 = h | t1 */
    128	swi	r9, r5, 8	/* *(d + 8) = t1 */
    129	bslli	r11, r12, 24	/* h = v << 24 */
    130	lwi	r12, r8, 16	/* v = *(as + 16) */
    131	bsrli	r9, r12, 8	/* t1 = v >> 8 */
    132	or	r9, r11, r9	/* t1 = h | t1 */
    133	swi	r9, r5, 12	/* *(d + 12) = t1 */
    134	bslli	r11, r12, 24	/* h = v << 24 */
    135	lwi	r12, r8, 20	/* v = *(as + 20) */
    136	bsrli	r9, r12, 8	/* t1 = v >> 8 */
    137	or	r9, r11, r9	/* t1 = h | t1 */
    138	swi	r9, r5, 16	/* *(d + 16) = t1 */
    139	bslli	r11, r12, 24	/* h = v << 24 */
    140	lwi	r12, r8, 24	/* v = *(as + 24) */
    141	bsrli	r9, r12, 8	/* t1 = v >> 8 */
    142	or	r9, r11, r9	/* t1 = h | t1 */
    143	swi	r9, r5, 20	/* *(d + 20) = t1 */
    144	bslli	r11, r12, 24	/* h = v << 24 */
    145	lwi	r12, r8, 28	/* v = *(as + 28) */
    146	bsrli	r9, r12, 8	/* t1 = v >> 8 */
    147	or	r9, r11, r9	/* t1 = h | t1 */
    148	swi	r9, r5, 24	/* *(d + 24) = t1 */
    149	bslli	r11, r12, 24	/* h = v << 24 */
    150	lwi	r12, r8, 32	/* v = *(as + 32) */
    151	bsrli	r9, r12, 8	/* t1 = v >> 8 */
    152	or	r9, r11, r9	/* t1 = h | t1 */
    153	swi	r9, r5, 28	/* *(d + 28) = t1 */
    154	bslli	r11, r12, 24	/* h = v << 24 */
    155	addi	r8, r8, 32	/* as = as + 32 */
    156	addi	r4, r4, -32	/* n = n - 32 */
    157	bneid	r4, a_bu3_loop	/* while (n) loop */
    158	addi	r5, r5, 32	/* d = d + 32 (IN DELAY SLOT) */
    159	bri	a_block_done
    160
    161a_block_u1:
    162	bslli	r11, r11, 8	/* h = h << 8 */
    163a_bu1_loop:
    164	lwi	r12, r8, 4	/* v = *(as + 4) */
    165	bsrli	r9, r12, 24	/* t1 = v >> 24 */
    166	or	r9, r11, r9	/* t1 = h | t1 */
    167	swi	r9, r5, 0	/* *(d + 0) = t1 */
    168	bslli	r11, r12, 8	/* h = v << 8 */
    169	lwi	r12, r8, 8	/* v = *(as + 8) */
    170	bsrli	r9, r12, 24	/* t1 = v >> 24 */
    171	or	r9, r11, r9	/* t1 = h | t1 */
    172	swi	r9, r5, 4	/* *(d + 4) = t1 */
    173	bslli	r11, r12, 8	/* h = v << 8 */
    174	lwi	r12, r8, 12	/* v = *(as + 12) */
    175	bsrli	r9, r12, 24	/* t1 = v >> 24 */
    176	or	r9, r11, r9	/* t1 = h | t1 */
    177	swi	r9, r5, 8	/* *(d + 8) = t1 */
    178	bslli	r11, r12, 8	/* h = v << 8 */
    179	lwi	r12, r8, 16	/* v = *(as + 16) */
    180	bsrli	r9, r12, 24	/* t1 = v >> 24 */
    181	or	r9, r11, r9	/* t1 = h | t1 */
    182	swi	r9, r5, 12	/* *(d + 12) = t1 */
    183	bslli	r11, r12, 8	/* h = v << 8 */
    184	lwi	r12, r8, 20	/* v = *(as + 20) */
    185	bsrli	r9, r12, 24	/* t1 = v >> 24 */
    186	or	r9, r11, r9	/* t1 = h | t1 */
    187	swi	r9, r5, 16	/* *(d + 16) = t1 */
    188	bslli	r11, r12, 8	/* h = v << 8 */
    189	lwi	r12, r8, 24	/* v = *(as + 24) */
    190	bsrli	r9, r12, 24	/* t1 = v >> 24 */
    191	or	r9, r11, r9	/* t1 = h | t1 */
    192	swi	r9, r5, 20	/* *(d + 20) = t1 */
    193	bslli	r11, r12, 8	/* h = v << 8 */
    194	lwi	r12, r8, 28	/* v = *(as + 28) */
    195	bsrli	r9, r12, 24	/* t1 = v >> 24 */
    196	or	r9, r11, r9	/* t1 = h | t1 */
    197	swi	r9, r5, 24	/* *(d + 24) = t1 */
    198	bslli	r11, r12, 8	/* h = v << 8 */
    199	lwi	r12, r8, 32	/* v = *(as + 32) */
    200	bsrli	r9, r12, 24	/* t1 = v >> 24 */
    201	or	r9, r11, r9	/* t1 = h | t1 */
    202	swi	r9, r5, 28	/* *(d + 28) = t1 */
    203	bslli	r11, r12, 8	/* h = v << 8 */
    204	addi	r8, r8, 32	/* as = as + 32 */
    205	addi	r4, r4, -32	/* n = n - 32 */
    206	bneid	r4, a_bu1_loop	/* while (n) loop */
    207	addi	r5, r5, 32	/* d = d + 32 (IN DELAY SLOT) */
    208	bri	a_block_done
    209
    210a_block_u2:
    211	bslli	r11, r11, 16	/* h = h << 16 */
    212a_bu2_loop:
    213	lwi	r12, r8, 4	/* v = *(as + 4) */
    214	bsrli	r9, r12, 16	/* t1 = v >> 16 */
    215	or	r9, r11, r9	/* t1 = h | t1 */
    216	swi	r9, r5, 0	/* *(d + 0) = t1 */
    217	bslli	r11, r12, 16	/* h = v << 16 */
    218	lwi	r12, r8, 8	/* v = *(as + 8) */
    219	bsrli	r9, r12, 16	/* t1 = v >> 16 */
    220	or	r9, r11, r9	/* t1 = h | t1 */
    221	swi	r9, r5, 4	/* *(d + 4) = t1 */
    222	bslli	r11, r12, 16	/* h = v << 16 */
    223	lwi	r12, r8, 12	/* v = *(as + 12) */
    224	bsrli	r9, r12, 16	/* t1 = v >> 16 */
    225	or	r9, r11, r9	/* t1 = h | t1 */
    226	swi	r9, r5, 8	/* *(d + 8) = t1 */
    227	bslli	r11, r12, 16	/* h = v << 16 */
    228	lwi	r12, r8, 16	/* v = *(as + 16) */
    229	bsrli	r9, r12, 16	/* t1 = v >> 16 */
    230	or	r9, r11, r9	/* t1 = h | t1 */
    231	swi	r9, r5, 12	/* *(d + 12) = t1 */
    232	bslli	r11, r12, 16	/* h = v << 16 */
    233	lwi	r12, r8, 20	/* v = *(as + 20) */
    234	bsrli	r9, r12, 16	/* t1 = v >> 16 */
    235	or	r9, r11, r9	/* t1 = h | t1 */
    236	swi	r9, r5, 16	/* *(d + 16) = t1 */
    237	bslli	r11, r12, 16	/* h = v << 16 */
    238	lwi	r12, r8, 24	/* v = *(as + 24) */
    239	bsrli	r9, r12, 16	/* t1 = v >> 16 */
    240	or	r9, r11, r9	/* t1 = h | t1 */
    241	swi	r9, r5, 20	/* *(d + 20) = t1 */
    242	bslli	r11, r12, 16	/* h = v << 16 */
    243	lwi	r12, r8, 28	/* v = *(as + 28) */
    244	bsrli	r9, r12, 16	/* t1 = v >> 16 */
    245	or	r9, r11, r9	/* t1 = h | t1 */
    246	swi	r9, r5, 24	/* *(d + 24) = t1 */
    247	bslli	r11, r12, 16	/* h = v << 16 */
    248	lwi	r12, r8, 32	/* v = *(as + 32) */
    249	bsrli	r9, r12, 16	/* t1 = v >> 16 */
    250	or	r9, r11, r9	/* t1 = h | t1 */
    251	swi	r9, r5, 28	/* *(d + 28) = t1 */
    252	bslli	r11, r12, 16	/* h = v << 16 */
    253	addi	r8, r8, 32	/* as = as + 32 */
    254	addi	r4, r4, -32	/* n = n - 32 */
    255	bneid	r4, a_bu2_loop	/* while (n) loop */
    256	addi	r5, r5, 32	/* d = d + 32 (IN DELAY SLOT) */
    257
    258a_block_done:
    259	addi	r4, r0, 4	/* n = 4 */
    260	cmpu	r4, r4, r7	/* n = c - n  (unsigned) */
    261	blti	r4, a_xfer_end	/* if n < 0, less than one word to transfer */
    262
    263a_word_xfer:
    264	andi	r4, r7, 0xfffffffc	/* n = c & ~3 */
    265	addi	r10, r0, 0		/* offset = 0 */
    266
    267	andi	r9, r6, 3		/* t1 = s & 3 */
    268	/* if temp != 0, unaligned transfers needed */
    269	bnei	r9, a_word_unaligned
    270
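	/*
	 * The word loops below use the register-indexed lw/sw forms
	 * (address = base + r10) rather than lwi/swi with a fixed
	 * immediate, so a single offset register walks both buffers.
	 */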
    271a_word_aligned:
    272	lw	r9, r6, r10		/* t1 = *(s+offset) */
    273	sw	r9, r5, r10		/* *(d+offset) = t1 */
    274	addi	r4, r4,-4		/* n = n - 4 */
    275	bneid	r4, a_word_aligned	/* loop */
    276	addi	r10, r10, 4		/* offset = offset + 4 (IN DELAY SLOT) */
    277
    278	bri	a_word_done
    279
    280a_word_unaligned:
    281	andi	r8, r6, 0xfffffffc	/* as = s & ~3 */
    282	lwi	r11, r8, 0		/* h = *(as + 0) */
    283	addi	r8, r8, 4		/* as = as + 4 */
    284
    285	addi	r9, r9, -1
    286	beqi	r9, a_word_u1		/* t1 was 1 => 1 byte offset */
    287	addi	r9, r9, -1
    288	beqi	r9, a_word_u2		/* t1 was 2 => 2 byte offset */
    289
    290a_word_u3:
    291	bslli	r11, r11, 24	/* h = h << 24 */
    292a_wu3_loop:
    293	lw	r12, r8, r10	/* v = *(as + offset) */
    294	bsrli	r9, r12, 8	/* t1 = v >> 8 */
    295	or	r9, r11, r9	/* t1 = h | t1 */
    296	sw	r9, r5, r10	/* *(d + offset) = t1 */
    297	bslli	r11, r12, 24	/* h = v << 24 */
    298	addi	r4, r4,-4	/* n = n - 4 */
    299	bneid	r4, a_wu3_loop	/* while (n) loop */
    300	addi	r10, r10, 4	/* offset = offset + 4 (IN DELAY SLOT) */
    301
    302	bri	a_word_done
    303
    304a_word_u1:
    305	bslli	r11, r11, 8	/* h = h << 8 */
    306a_wu1_loop:
    307	lw	r12, r8, r10	/* v = *(as + offset) */
    308	bsrli	r9, r12, 24	/* t1 = v >> 24 */
    309	or	r9, r11, r9	/* t1 = h | t1 */
    310	sw	r9, r5, r10	/* *(d + offset) = t1 */
    311	bslli	r11, r12, 8	/* h = v << 8 */
    312	addi	r4, r4,-4	/* n = n - 4 */
    313	bneid	r4, a_wu1_loop	/* while (n) loop */
    314	addi	r10, r10, 4	/* offset = offset + 4 (IN DELAY SLOT) */
    315
    316	bri	a_word_done
    317
    318a_word_u2:
    319	bslli	r11, r11, 16	/* h = h << 16 */
    320a_wu2_loop:
    321	lw	r12, r8, r10	/* v = *(as + offset) */
    322	bsrli	r9, r12, 16	/* t1 = v >> 16 */
    323	or	r9, r11, r9	/* t1 = h | t1 */
    324	sw	r9, r5, r10	/* *(d + offset) = t1 */
    325	bslli	r11, r12, 16	/* h = v << 16 */
    326	addi	r4, r4,-4	/* n = n - 4 */
    327	bneid	r4, a_wu2_loop	/* while (n) loop */
    328	addi	r10, r10, 4	/* offset = offset + 4 (IN DELAY SLOT) */
    329
    330a_word_done:
    331	add	r5, r5, r10	/* d = d + offset */
    332	add	r6, r6, r10	/* s = s + offset */
    333	rsub	r7, r10, r7	/* c = c - offset */
    334
    335a_xfer_end:
    336a_xfer_end_loop:
    337	beqi	r7, a_done		/* while (c) */
    338	lbui	r9, r6, 0		/* t1 = *s */
    339	addi	r6, r6, 1		/* s++ */
    340	sbi	r9, r5, 0		/* *d = t1 */
    341	addi	r7, r7, -1		/* c-- */
    342	brid	a_xfer_end_loop		/* loop */
    343	addi	r5, r5, 1		/* d++ (IN DELAY SLOT) */
    344
    345a_done:
    346	rtsd	r15, 8
    347	nop
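	/*
	 * rtsd r15, 8 returns to the caller: r15 holds the address of the
	 * calling branch-and-link, and the +8 skips that instruction and
	 * its delay slot.  The nop fills rtsd's own delay slot.
	 */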
    348
    349.size  memcpy, . - memcpy
    350.end memcpy
    351/*----------------------------------------------------------------------------*/
    352	.globl	memmove
    353	.type  memmove, @function
    354	.ent	memmove
    355
    356memmove:
    357	cmpu	r4, r5, r6	/* n = s - d */
    358	bgei	r4,fast_memcpy_ascending
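	/*
	 * Overlap handling: when s >= d an ascending copy never overwrites
	 * source bytes it still has to read, so memcpy's forward path is
	 * reused; otherwise the copy below runs downwards from the end of
	 * both buffers.
	 */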
    359
    360fast_memcpy_descending:
    361	/* move d to return register as value of function */
    362	addi	r3, r5, 0
    363
    364	add	r5, r5, r7	/* d = d + c */
    365	add	r6, r6, r7	/* s = s + c */
    366
    367	addi	r4, r0, 4	/* n = 4 */
    368	cmpu	r4, r4, r7	/* n = c - n  (unsigned) */
    369	blti	r4,d_xfer_end	/* if n < 0, less than one word to transfer */
    370
    371	/* transfer first 0~3 bytes to get aligned dest address */
    372	andi	r4, r5, 3		/* n = d & 3 */
    373	/* if zero, destination already aligned */
    374	beqi	r4,d_dalign_done
    375	rsub	r7, r4, r7		/* c = c - n adjust c */
    376
    377d_xfer_first_loop:
    378	/* if no bytes left to transfer, transfer the bulk */
    379	beqi	r4,d_dalign_done
    380	addi	r6, r6, -1		/* s-- */
    381	addi	r5, r5, -1		/* d-- */
    382	lbui	r11, r6, 0		/* h = *s */
    383	sbi	r11, r5, 0		/* *d = h */
    384	brid	d_xfer_first_loop	/* loop */
    385	addi	r4, r4, -1		/* n-- (IN DELAY SLOT) */
    386
    387d_dalign_done:
    388	addi	r4, r0, 32	/* n = 32 */
    389	cmpu	r4, r4, r7	/* n = c - n  (unsigned) */
    390	/* if n < 0, less than one block to transfer */
    391	blti	r4, d_block_done
    392
    393d_block_xfer:
    394	andi	r4, r7, 0xffffffe0	/* n = c & ~31 */
    395	rsub	r7, r4, r7		/* c = c - n */
    396
    397	andi	r9, r6, 3		/* t1 = s & 3 */
    398	/* if temp != 0, unaligned transfers needed */
    399	bnei	r9, d_block_unaligned
    400
    401d_block_aligned:
    402	addi	r6, r6, -32		/* s = s - 32 */
    403	addi	r5, r5, -32		/* d = d - 32 */
    404	lwi	r9, r6, 28		/* t1 = *(s + 28) */
    405	lwi	r10, r6, 24		/* t2 = *(s + 24) */
    406	lwi	r11, r6, 20		/* t3 = *(s + 20) */
    407	lwi	r12, r6, 16		/* t4 = *(s + 16) */
    408	swi	r9, r5, 28		/* *(d + 28) = t1 */
    409	swi	r10, r5, 24		/* *(d + 24) = t2 */
    410	swi	r11, r5, 20		/* *(d + 20) = t3 */
    411	swi	r12, r5, 16		/* *(d + 16) = t4 */
    412	lwi	r9, r6, 12		/* t1 = *(s + 12) */
    413	lwi	r10, r6, 8		/* t2 = *(s + 8) */
    414	lwi	r11, r6, 4		/* t3 = *(s + 4) */
    415	lwi	r12, r6, 0		/* t4 = *(s + 0) */
    416	swi	r9, r5, 12		/* *(d + 12) = t1 */
    417	swi	r10, r5, 8		/* *(d + 8) = t2 */
    418	swi	r11, r5, 4		/* *(d + 4) = t3 */
    419	addi	r4, r4, -32		/* n = n - 32 */
    420	bneid	r4, d_block_aligned	/* while (n) loop */
    421	swi	r12, r5, 0		/* *(d + 0) = t4 (IN DELAY SLOT) */
    422	bri	d_block_done
    423
    424d_block_unaligned:
    425	andi	r8, r6, 0xfffffffc	/* as = s & ~3 */
    426	rsub	r6, r4, r6		/* s = s - n */
    427	lwi	r11, r8, 0		/* h = *(as + 0) */
    428
    429	addi	r9, r9, -1
    430	beqi	r9,d_block_u1		/* t1 was 1 => 1 byte offset */
    431	addi	r9, r9, -1
    432	beqi	r9,d_block_u2		/* t1 was 2 => 2 byte offset */
    433
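	/*
	 * Descending unaligned block copy: same shift-and-merge scheme as
	 * a_block_u1/u2/u3 above but walking downwards, with the shift
	 * directions swapped; for the 3-byte offset handled first this is
	 * roughly out = (above >> 8) | (below << 24), where "above" is the
	 * aligned word at the higher address.
	 */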
    434d_block_u3:
    435	bsrli	r11, r11, 8	/* h = h >> 8 */
    436d_bu3_loop:
    437	addi	r8, r8, -32	/* as = as - 32 */
    438	addi	r5, r5, -32	/* d = d - 32 */
    439	lwi	r12, r8, 28	/* v = *(as + 28) */
    440	bslli	r9, r12, 24	/* t1 = v << 24 */
    441	or	r9, r11, r9	/* t1 = h | t1 */
    442	swi	r9, r5, 28	/* *(d + 28) = t1 */
    443	bsrli	r11, r12, 8	/* h = v >> 8 */
    444	lwi	r12, r8, 24	/* v = *(as + 24) */
    445	bslli	r9, r12, 24	/* t1 = v << 24 */
    446	or	r9, r11, r9	/* t1 = h | t1 */
    447	swi	r9, r5, 24	/* *(d + 24) = t1 */
    448	bsrli	r11, r12, 8	/* h = v >> 8 */
    449	lwi	r12, r8, 20	/* v = *(as + 20) */
    450	bslli	r9, r12, 24	/* t1 = v << 24 */
    451	or	r9, r11, r9	/* t1 = h | t1 */
    452	swi	r9, r5, 20	/* *(d + 20) = t1 */
    453	bsrli	r11, r12, 8	/* h = v >> 8 */
    454	lwi	r12, r8, 16	/* v = *(as + 16) */
    455	bslli	r9, r12, 24	/* t1 = v << 24 */
    456	or	r9, r11, r9	/* t1 = h | t1 */
    457	swi	r9, r5, 16	/* *(d + 16) = t1 */
    458	bsrli	r11, r12, 8	/* h = v >> 8 */
    459	lwi	r12, r8, 12	/* v = *(as + 12) */
    460	bslli	r9, r12, 24	/* t1 = v << 24 */
    461	or	r9, r11, r9	/* t1 = h | t1 */
    462	swi	r9, r5, 12	/* *(d + 12) = t1 */
    463	bsrli	r11, r12, 8	/* h = v >> 8 */
    464	lwi	r12, r8, 8	/* v = *(as + 8) */
    465	bslli	r9, r12, 24	/* t1 = v << 24 */
    466	or	r9, r11, r9	/* t1 = h | t1 */
    467	swi	r9, r5, 8	/* *(d + 8) = t1 */
    468	bsrli	r11, r12, 8	/* h = v >> 8 */
    469	lwi	r12, r8, 4	/* v = *(as + 4) */
    470	bslli	r9, r12, 24	/* t1 = v << 24 */
    471	or	r9, r11, r9	/* t1 = h | t1 */
    472	swi	r9, r5, 4	/* *(d + 4) = t1 */
    473	bsrli	r11, r12, 8	/* h = v >> 8 */
    474	lwi	r12, r8, 0	/* v = *(as + 0) */
    475	bslli	r9, r12, 24	/* t1 = v << 24 */
    476	or	r9, r11, r9	/* t1 = h | t1 */
    477	swi	r9, r5, 0	/* *(d + 0) = t1 */
    478	addi	r4, r4, -32	/* n = n - 32 */
    479	bneid	r4, d_bu3_loop	/* while (n) loop */
    480	bsrli	r11, r12, 8	/* h = v >> 8 (IN DELAY SLOT) */
    481	bri	d_block_done
    482
    483d_block_u1:
    484	bsrli	r11, r11, 24	/* h = h >> 24 */
    485d_bu1_loop:
    486	addi	r8, r8, -32	/* as = as - 32 */
    487	addi	r5, r5, -32	/* d = d - 32 */
    488	lwi	r12, r8, 28	/* v = *(as + 28) */
    489	bslli	r9, r12, 8	/* t1 = v << 8 */
    490	or	r9, r11, r9	/* t1 = h | t1 */
    491	swi	r9, r5, 28	/* *(d + 28) = t1 */
    492	bsrli	r11, r12, 24	/* h = v >> 24 */
    493	lwi	r12, r8, 24	/* v = *(as + 24) */
    494	bslli	r9, r12, 8	/* t1 = v << 8 */
    495	or	r9, r11, r9	/* t1 = h | t1 */
    496	swi	r9, r5, 24	/* *(d + 24) = t1 */
    497	bsrli	r11, r12, 24	/* h = v >> 24 */
    498	lwi	r12, r8, 20	/* v = *(as + 20) */
    499	bslli	r9, r12, 8	/* t1 = v << 8 */
    500	or	r9, r11, r9	/* t1 = h | t1 */
    501	swi	r9, r5, 20	/* *(d + 20) = t1 */
    502	bsrli	r11, r12, 24	/* h = v >> 24 */
    503	lwi	r12, r8, 16	/* v = *(as + 16) */
    504	bslli	r9, r12, 8	/* t1 = v << 8 */
    505	or	r9, r11, r9	/* t1 = h | t1 */
    506	swi	r9, r5, 16	/* *(d + 16) = t1 */
    507	bsrli	r11, r12, 24	/* h = v >> 24 */
    508	lwi	r12, r8, 12	/* v = *(as + 12) */
    509	bslli	r9, r12, 8	/* t1 = v << 8 */
    510	or	r9, r11, r9	/* t1 = h | t1 */
    511	swi	r9, r5, 12	/* *(d + 12) = t1 */
    512	bsrli	r11, r12, 24	/* h = v >> 24 */
    513	lwi	r12, r8, 8	/* v = *(as + 8) */
    514	bslli	r9, r12, 8	/* t1 = v << 8 */
    515	or	r9, r11, r9	/* t1 = h | t1 */
    516	swi	r9, r5, 8	/* *(d + 8) = t1 */
    517	bsrli	r11, r12, 24	/* h = v >> 24 */
    518	lwi	r12, r8, 4	/* v = *(as + 4) */
    519	bslli	r9, r12, 8	/* t1 = v << 8 */
    520	or	r9, r11, r9	/* t1 = h | t1 */
    521	swi	r9, r5, 4	/* *(d + 4) = t1 */
    522	bsrli	r11, r12, 24	/* h = v >> 24 */
    523	lwi	r12, r8, 0	/* v = *(as + 0) */
    524	bslli	r9, r12, 8	/* t1 = v << 8 */
    525	or	r9, r11, r9	/* t1 = h | t1 */
    526	swi	r9, r5, 0	/* *(d + 0) = t1 */
    527	addi	r4, r4, -32	/* n = n - 32 */
    528	bneid	r4, d_bu1_loop	/* while (n) loop */
    529	bsrli	r11, r12, 24	/* h = v >> 24 (IN DELAY SLOT) */
    530	bri	d_block_done
    531
    532d_block_u2:
    533	bsrli	r11, r11, 16	/* h = h >> 16 */
    534d_bu2_loop:
    535	addi	r8, r8, -32	/* as = as - 32 */
    536	addi	r5, r5, -32	/* d = d - 32 */
    537	lwi	r12, r8, 28	/* v = *(as + 28) */
    538	bslli	r9, r12, 16	/* t1 = v << 16 */
    539	or	r9, r11, r9	/* t1 = h | t1 */
    540	swi	r9, r5, 28	/* *(d + 28) = t1 */
    541	bsrli	r11, r12, 16	/* h = v >> 16 */
    542	lwi	r12, r8, 24	/* v = *(as + 24) */
    543	bslli	r9, r12, 16	/* t1 = v << 16 */
    544	or	r9, r11, r9	/* t1 = h | t1 */
    545	swi	r9, r5, 24	/* *(d + 24) = t1 */
    546	bsrli	r11, r12, 16	/* h = v >> 16 */
    547	lwi	r12, r8, 20	/* v = *(as + 20) */
    548	bslli	r9, r12, 16	/* t1 = v << 16 */
    549	or	r9, r11, r9	/* t1 = h | t1 */
    550	swi	r9, r5, 20	/* *(d + 20) = t1 */
    551	bsrli	r11, r12, 16	/* h = v >> 16 */
    552	lwi	r12, r8, 16	/* v = *(as + 16) */
    553	bslli	r9, r12, 16	/* t1 = v << 16 */
    554	or	r9, r11, r9	/* t1 = h | t1 */
    555	swi	r9, r5, 16	/* *(d + 16) = t1 */
    556	bsrli	r11, r12, 16	/* h = v >> 16 */
    557	lwi	r12, r8, 12	/* v = *(as + 12) */
    558	bslli	r9, r12, 16	/* t1 = v << 16 */
    559	or	r9, r11, r9	/* t1 = h | t1 */
    560	swi	r9, r5, 12	/* *(d + 12) = t1 */
    561	bsrli	r11, r12, 16	/* h = v >> 16 */
    562	lwi	r12, r8, 8	/* v = *(as + 8) */
    563	bslli	r9, r12, 16	/* t1 = v << 16 */
    564	or	r9, r11, r9	/* t1 = h | t1 */
    565	swi	r9, r5, 8	/* *(d + 8) = t1 */
    566	bsrli	r11, r12, 16	/* h = v >> 16 */
    567	lwi	r12, r8, 4	/* v = *(as + 4) */
    568	bslli	r9, r12, 16	/* t1 = v << 16 */
    569	or	r9, r11, r9	/* t1 = h | t1 */
    570	swi	r9, r5, 4	/* *(d + 4) = t1 */
    571	bsrli	r11, r12, 16	/* h = v >> 16 */
    572	lwi	r12, r8, 0	/* v = *(as + 0) */
    573	bslli	r9, r12, 16	/* t1 = v << 16 */
    574	or	r9, r11, r9	/* t1 = h | t1 */
    575	swi	r9, r5, 0	/* *(d + 0) = t1 */
    576	addi	r4, r4, -32	/* n = n - 32 */
    577	bneid	r4, d_bu2_loop	/* while (n) loop */
    578	bsrli	r11, r12, 16	/* h = v >> 16 (IN DELAY SLOT) */
    579
    580d_block_done:
    581	addi	r4, r0, 4	/* n = 4 */
    582	cmpu	r4, r4, r7	/* n = c - n  (unsigned) */
    583	blti	r4,d_xfer_end	/* if n < 0, less than one word to transfer */
    584
    585d_word_xfer:
    586	andi	r4, r7, 0xfffffffc	/* n = c & ~3 */
    587	rsub	r5, r4, r5		/* d = d - n */
    588	rsub	r6, r4, r6		/* s = s - n */
    589	rsub	r7, r4, r7		/* c = c - n */
    590
    591	andi	r9, r6, 3		/* t1 = s & 3 */
    592	/* if temp != 0, unaligned transfers needed */
    593	bnei	r9, d_word_unaligned
    594
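	/*
	 * In the descending word loops n (r4) doubles as both the remaining
	 * byte count and the offset from the already-rewound d/s pointers,
	 * so no separate offset register is needed.
	 */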
    595d_word_aligned:
    596	addi	r4, r4,-4		/* n = n - 4 */
    597	lw	r9, r6, r4		/* t1 = *(s+n) */
    598	bneid	r4, d_word_aligned	/* loop */
    599	sw	r9, r5, r4		/* *(d+n) = t1 (IN DELAY SLOT) */
    600
    601	bri	d_word_done
    602
    603d_word_unaligned:
    604	andi	r8, r6, 0xfffffffc	/* as = s & ~3 */
    605	lw	r11, r8, r4		/* h = *(as + n) */
    606
    607	addi	r9, r9, -1
    608	beqi	r9,d_word_u1		/* t1 was 1 => 1 byte offset */
    609	addi	r9, r9, -1
    610	beqi	r9,d_word_u2		/* t1 was 2 => 2 byte offset */
    611
    612d_word_u3:
    613	bsrli	r11, r11, 8	/* h = h >> 8 */
    614d_wu3_loop:
    615	addi	r4, r4,-4	/* n = n - 4 */
    616	lw	r12, r8, r4	/* v = *(as + n) */
    617	bslli	r9, r12, 24	/* t1 = v << 24 */
    618	or	r9, r11, r9	/* t1 = h | t1 */
    619	sw	r9, r5, r4	/* *(d + n) = t1 */
    620	bneid	r4, d_wu3_loop	/* while (n) loop */
    621	bsrli	r11, r12, 8	/* h = v >> 8 (IN DELAY SLOT) */
    622
    623	bri	d_word_done
    624
    625d_word_u1:
    626	bsrli	r11, r11, 24	/* h = h >> 24 */
    627d_wu1_loop:
    628	addi	r4, r4,-4	/* n = n - 4 */
    629	lw	r12, r8, r4	/* v = *(as + n) */
    630	bslli	r9, r12, 8	/* t1 = v << 8 */
    631	or	r9, r11, r9	/* t1 = h | t1 */
    632	sw	r9, r5, r4	/* *(d + n) = t1 */
    633	bneid	r4, d_wu1_loop	/* while (n) loop */
    634	bsrli	r11, r12, 24	/* h = v >> 24 (IN DELAY SLOT) */
    635
    636	bri	d_word_done
    637
    638d_word_u2:
    639	bsrli	r11, r11, 16	/* h = h >> 16 */
    640d_wu2_loop:
    641	addi	r4, r4,-4	/* n = n - 4 */
    642	lw	r12, r8, r4	/* v = *(as + n) */
    643	bslli	r9, r12, 16	/* t1 = v << 16 */
    644	or	r9, r11, r9	/* t1 = h | t1 */
    645	sw	r9, r5, r4	/* *(d + n) = t1 */
    646	bneid	r4, d_wu2_loop	/* while (n) loop */
    647	bsrli	r11, r12, 16	/* h = v >> 16 (IN DELAY SLOT) */
    648
    649d_word_done:
    650
    651d_xfer_end:
    652d_xfer_end_loop:
    653	beqi	r7, a_done		/* while (c) */
    654	addi	r6, r6, -1		/* s-- */
    655	lbui	r9, r6, 0		/* t1 = *s */
    656	addi	r5, r5, -1		/* d-- */
    657	sbi	r9, r5, 0		/* *d = t1 */
    658	brid	d_xfer_end_loop		/* loop */
    659	addi	r7, r7, -1		/* c-- (IN DELAY SLOT) */
    660
    661d_done:
    662	rtsd	r15, 8
    663	nop
    664
    665.size  memmove, . - memmove
    666.end memmove