cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

memmove.S (8025B)


/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2022 Michael T. Kloos <michael@michaelkloos.com>
 */

#include <linux/linkage.h>
#include <asm/asm.h>

SYM_FUNC_START(__memmove)
SYM_FUNC_START_WEAK(memmove)
	/*
	 * Returns
	 *   a0 - dest
	 *
	 * Parameters
	 *   a0 - Inclusive first byte of dest
	 *   a1 - Inclusive first byte of src
	 *   a2 - Length of copy n
	 *
	 * Because the return matches the parameter register a0,
	 * we will not clobber or modify that register.
	 *
	 * Note: This currently only works on little-endian.
	 * To port to big-endian, reverse the direction of shifts
	 * in the 2 misaligned fixup copy loops.
	 */

	/* Return if nothing to do */
	beq a0, a1, return_from_memmove
	beqz a2, return_from_memmove

	/*
	 * Register Uses
	 *      Forward Copy: a1 - Index counter of src
	 *      Reverse Copy: a4 - Index counter of src
	 *      Forward Copy: t3 - Index counter of dest
	 *      Reverse Copy: t4 - Index counter of dest
	 *   Both Copy Modes: t5 - Inclusive first multibyte/aligned of dest
	 *   Both Copy Modes: t6 - Non-Inclusive last multibyte/aligned of dest
	 *   Both Copy Modes: t0 - Link / Temporary for load-store
	 *   Both Copy Modes: t1 - Temporary for load-store
	 *   Both Copy Modes: t2 - Temporary for load-store
	 *   Both Copy Modes: a5 - dest to src alignment offset
	 *   Both Copy Modes: a6 - Shift amount
	 *   Both Copy Modes: a7 - Inverse Shift amount
	 *   Both Copy Modes: a2 - Alternate breakpoint for unrolled loops
	 */

	/*
	 * Solve for some register values now.
	 * Byte copy does not need t5 or t6.
	 */
	mv   t3, a0
	add  t4, a0, a2
	add  a4, a1, a2

	/*
	 * Byte copy if copying less than (2 * SZREG) bytes. This can
	 * cause problems with the bulk copy implementation and is
	 * small enough not to bother.
	 */
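	/*
	 * Masking a2 with -(2 * SZREG) clears its low bits, so t0 below
	 * is zero exactly when a2 < (2 * SZREG) and the byte copy path
	 * is taken.
	 */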
	andi t0, a2, -(2 * SZREG)
	beqz t0, byte_copy

	/*
	 * Now solve for t5 and t6.
	 */
	andi t5, t3, -SZREG
	andi t6, t4, -SZREG
	/*
	 * If dest (register t3) rounded down to the nearest naturally
	 * aligned SZREG address does not equal dest, add SZREG to find
	 * the low bound of SZREG alignment in the dest memory region.
	 * Note that this could overshoot the dest memory region if n is
	 * less than SZREG.  This is one reason why we always byte copy
	 * if n is less than SZREG.  Otherwise, dest is already naturally
	 * aligned to SZREG.
	 */
	beq  t5, t3, 1f
		addi t5, t5, SZREG
	1:

	/*
	 * If the dest and src are co-aligned to SZREG, then there is
	 * no need for the full rigmarole of a misaligned fixup copy.
	 * Instead, do a simpler co-aligned copy.
	 */
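	/*
	 * The xor/andi pair compares only the low bits: t1 is zero iff
	 * dest and src have the same offset within an SZREG-sized word.
	 */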
	xor  t0, a0, a1
	andi t1, t0, (SZREG - 1)
	beqz t1, coaligned_copy
	/* Fall through to misaligned fixup copy */

misaligned_fixup_copy:
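	/*
	 * If src < dest, an overlapping forward copy could overwrite
	 * source bytes before they are read, so copy from the end
	 * backwards in that case.
	 */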
	bltu a1, a0, misaligned_fixup_copy_reverse

misaligned_fixup_copy_forward:
	jal  t0, byte_copy_until_aligned_forward

	andi a5, a1, (SZREG - 1) /* Find the alignment offset of src (a1) */
	slli a6, a5, 3 /* Multiply by 8 to convert that to bits to shift */
	sub  a5, a1, t3 /* Find the difference between src and dest */
	andi a1, a1, -SZREG /* Align the src pointer */
	addi a2, t6, SZREG /* The other breakpoint for the unrolled loop */

	/*
	 * Compute The Inverse Shift
	 * a7 = XLEN - a6 = XLEN + -a6
	 * 2s complement negation to find the negative: -a6 = ~a6 + 1
	 * Add that to XLEN.  XLEN = SZREG * 8.
	 */
	not  a7, a6
	addi a7, a7, (SZREG * 8 + 1)
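	/*
	 * Example (RV64, SZREG = 8): if src is 3 bytes past an SZREG
	 * boundary, a6 = 24 and a7 = 40, so each store in the loop below
	 * combines the upper 40 bits of one aligned load with the lower
	 * 24 bits of the next.
	 */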

	/*
	 * Fix Misalignment Copy Loop - Forward
	 * load_val0 = load_ptr[0];
	 * do {
	 * 	load_val1 = load_ptr[1];
	 * 	store_ptr += 2;
	 * 	store_ptr[0 - 2] = (load_val0 >> {a6}) | (load_val1 << {a7});
	 *
	 * 	if (store_ptr == {a2})
	 * 		break;
	 *
	 * 	load_val0 = load_ptr[2];
	 * 	load_ptr += 2;
	 * 	store_ptr[1 - 2] = (load_val1 >> {a6}) | (load_val0 << {a7});
	 *
	 * } while (store_ptr != store_ptr_end);
	 * store_ptr = store_ptr_end;
	 */

	REG_L t0, (0 * SZREG)(a1)
	1:
	REG_L t1, (1 * SZREG)(a1)
	addi  t3, t3, (2 * SZREG)
	srl   t0, t0, a6
	sll   t2, t1, a7
	or    t2, t0, t2
	REG_S t2, ((0 * SZREG) - (2 * SZREG))(t3)

	beq   t3, a2, 2f

	REG_L t0, (2 * SZREG)(a1)
	addi  a1, a1, (2 * SZREG)
	srl   t1, t1, a6
	sll   t2, t0, a7
	or    t2, t1, t2
	REG_S t2, ((1 * SZREG) - (2 * SZREG))(t3)

	bne   t3, t6, 1b
	2:
	mv    t3, t6 /* Fix the dest pointer in case the loop was broken */

	add  a1, t3, a5 /* Restore the src pointer */
	j byte_copy_forward /* Copy any remaining bytes */

misaligned_fixup_copy_reverse:
	jal  t0, byte_copy_until_aligned_reverse

	andi a5, a4, (SZREG - 1) /* Find the alignment offset of src (a4) */
	slli a6, a5, 3 /* Multiply by 8 to convert that to bits to shift */
	sub  a5, a4, t4 /* Find the difference between src and dest */
	andi a4, a4, -SZREG /* Align the src pointer */
	addi a2, t5, -SZREG /* The other breakpoint for the unrolled loop */

	/*
	 * Compute The Inverse Shift
	 * a7 = XLEN - a6 = XLEN + -a6
	 * 2s complement negation to find the negative: -a6 = ~a6 + 1
	 * Add that to XLEN.  XLEN = SZREG * 8.
	 */
	not  a7, a6
	addi a7, a7, (SZREG * 8 + 1)
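	/*
	 * Same shift amounts as in the forward case; the loop below
	 * combines adjacent aligned words in the same way while walking
	 * downward through src and dest.
	 */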

	/*
	 * Fix Misalignment Copy Loop - Reverse
	 * load_val1 = load_ptr[0];
	 * do {
	 * 	load_val0 = load_ptr[-1];
	 * 	store_ptr -= 2;
	 * 	store_ptr[1] = (load_val0 >> {a6}) | (load_val1 << {a7});
	 *
	 * 	if (store_ptr == {a2})
	 * 		break;
	 *
	 * 	load_val1 = load_ptr[-2];
	 * 	load_ptr -= 2;
	 * 	store_ptr[0] = (load_val1 >> {a6}) | (load_val0 << {a7});
	 *
	 * } while (store_ptr != store_ptr_end);
	 * store_ptr = store_ptr_end;
	 */

	REG_L t1, ( 0 * SZREG)(a4)
	1:
	REG_L t0, (-1 * SZREG)(a4)
	addi  t4, t4, (-2 * SZREG)
	sll   t1, t1, a7
	srl   t2, t0, a6
	or    t2, t1, t2
	REG_S t2, ( 1 * SZREG)(t4)

	beq   t4, a2, 2f

	REG_L t1, (-2 * SZREG)(a4)
	addi  a4, a4, (-2 * SZREG)
	sll   t0, t0, a7
	srl   t2, t1, a6
	or    t2, t0, t2
	REG_S t2, ( 0 * SZREG)(t4)

	bne   t4, t5, 1b
	2:
	mv    t4, t5 /* Fix the dest pointer in case the loop was broken */

	add  a4, t4, a5 /* Restore the src pointer */
	j byte_copy_reverse /* Copy any remaining bytes */

/*
 * Simple copy loops for SZREG co-aligned memory locations.
 * These also make calls to do byte copies for any unaligned
 * data at their terminations.
 */
coaligned_copy:
	bltu a1, a0, coaligned_copy_reverse

coaligned_copy_forward:
	jal t0, byte_copy_until_aligned_forward

	1:
	REG_L t1, ( 0 * SZREG)(a1)
	addi  a1, a1, SZREG
	addi  t3, t3, SZREG
	REG_S t1, (-1 * SZREG)(t3)
	bne   t3, t6, 1b

	j byte_copy_forward /* Copy any remaining bytes */

coaligned_copy_reverse:
	jal t0, byte_copy_until_aligned_reverse

	1:
	REG_L t1, (-1 * SZREG)(a4)
	addi  a4, a4, -SZREG
	addi  t4, t4, -SZREG
	REG_S t1, ( 0 * SZREG)(t4)
	bne   t4, t5, 1b

	j byte_copy_reverse /* Copy any remaining bytes */

/*
 * These are basically sub-functions within the function.  They
 * byte copy until the dest pointer is aligned, at which point
 * the calling code can switch to a bulk copy method.  They work
 * on the same registers as the bulk copy loops, so the register
 * values can be picked up where they were left off and we avoid
 * code duplication without any overhead except the call-in and
 * return jumps.
 */
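/*
 * Note that "jal t0, ..." places the return address in t0 rather than
 * ra, and "jalr zero, 0x0(t0)" jumps back through it, so the caller's
 * ra is never clobbered by these helpers.
 */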
byte_copy_until_aligned_forward:
	beq  t3, t5, 2f
	1:
	lb   t1,  0(a1)
	addi a1, a1, 1
	addi t3, t3, 1
	sb   t1, -1(t3)
	bne  t3, t5, 1b
	2:
	jalr zero, 0x0(t0) /* Return to multibyte copy loop */

byte_copy_until_aligned_reverse:
	beq  t4, t6, 2f
	1:
	lb   t1, -1(a4)
	addi a4, a4, -1
	addi t4, t4, -1
	sb   t1,  0(t4)
	bne  t4, t6, 1b
	2:
	jalr zero, 0x0(t0) /* Return to multibyte copy loop */

/*
 * Simple byte copy loops.
 * These will byte copy until they reach the end of the data to copy.
 * At that point, they return from memmove.
 */
byte_copy:
	bltu a1, a0, byte_copy_reverse

byte_copy_forward:
	beq  t3, t4, 2f
	1:
	lb   t1,  0(a1)
	addi a1, a1, 1
	addi t3, t3, 1
	sb   t1, -1(t3)
	bne  t3, t4, 1b
	2:
	ret

byte_copy_reverse:
	beq  t4, t3, 2f
	1:
	lb   t1, -1(a4)
	addi a4, a4, -1
	addi t4, t4, -1
	sb   t1,  0(t4)
	bne  t4, t3, 1b
	2:

return_from_memmove:
	ret

SYM_FUNC_END(memmove)
SYM_FUNC_END(__memmove)